diff --git a/4.0/_modules/index.html b/4.0/_modules/index.html
index e7be3c0..8d60c10 100644
--- a/4.0/_modules/index.html
+++ b/4.0/_modules/index.html
@@ -211,5 +211,5 @@

All modules for which code is available

});
-
+
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/augment/lm/fasttext.html b/4.0/_modules/pythainlp/augment/lm/fasttext.html
index a7d5927..0676be5 100644
--- a/4.0/_modules/pythainlp/augment/lm/fasttext.html
+++ b/4.0/_modules/pythainlp/augment/lm/fasttext.html
@@ -217,5 +217,5 @@

Source code for pythainlp.augment.lm.fasttext

-
+
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/augment/lm/wangchanberta.html b/4.0/_modules/pythainlp/augment/lm/wangchanberta.html
index 700990a..f20583d 100644
--- a/4.0/_modules/pythainlp/augment/lm/wangchanberta.html
+++ b/4.0/_modules/pythainlp/augment/lm/wangchanberta.html
@@ -219,5 +219,5 @@

Source code for pythainlp.augment.lm.wangchanberta

\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/augment/word2vec/bpemb_wv.html b/4.0/_modules/pythainlp/augment/word2vec/bpemb_wv.html
index 5b22585..70d39d7 100644
--- a/4.0/_modules/pythainlp/augment/word2vec/bpemb_wv.html
+++ b/4.0/_modules/pythainlp/augment/word2vec/bpemb_wv.html
@@ -204,5 +204,5 @@

Source code for pythainlp.augment.word2vec.bpemb_wv

\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/augment/word2vec/core.html b/4.0/_modules/pythainlp/augment/word2vec/core.html
index 63f84f9..3800689 100644
--- a/4.0/_modules/pythainlp/augment/word2vec/core.html
+++ b/4.0/_modules/pythainlp/augment/word2vec/core.html
@@ -201,5 +201,5 @@

Source code for pythainlp.augment.word2vec.core

});
-
+
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/augment/word2vec/ltw2v.html b/4.0/_modules/pythainlp/augment/word2vec/ltw2v.html
index 409f120..c86dca6 100644
--- a/4.0/_modules/pythainlp/augment/word2vec/ltw2v.html
+++ b/4.0/_modules/pythainlp/augment/word2vec/ltw2v.html
@@ -194,5 +194,5 @@

Source code for pythainlp.augment.word2vec.ltw2v

});
-
+
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/augment/word2vec/thai2fit.html b/4.0/_modules/pythainlp/augment/word2vec/thai2fit.html
index d033061..b7ab2c2 100644
--- a/4.0/_modules/pythainlp/augment/word2vec/thai2fit.html
+++ b/4.0/_modules/pythainlp/augment/word2vec/thai2fit.html
@@ -194,5 +194,5 @@

Source code for pythainlp.augment.word2vec.thai2fit

\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/augment/wordnet.html b/4.0/_modules/pythainlp/augment/wordnet.html
index 2d884f8..6406a3f 100644
--- a/4.0/_modules/pythainlp/augment/wordnet.html
+++ b/4.0/_modules/pythainlp/augment/wordnet.html
@@ -352,5 +352,5 @@

Source code for pythainlp.augment.wordnet

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/benchmarks/word_tokenization.html b/4.0/_modules/pythainlp/benchmarks/word_tokenization.html
index 7a143bd..9038c89 100644
--- a/4.0/_modules/pythainlp/benchmarks/word_tokenization.html
+++ b/4.0/_modules/pythainlp/benchmarks/word_tokenization.html
@@ -414,5 +414,5 @@ 

Source code for pythainlp.benchmarks.word_tokenization

\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/corpus/common.html b/4.0/_modules/pythainlp/corpus/common.html
index 0f221e4..6b9de93 100644
--- a/4.0/_modules/pythainlp/corpus/common.html
+++ b/4.0/_modules/pythainlp/corpus/common.html
@@ -366,5 +366,5 @@

Source code for pythainlp.corpus.common

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/corpus/conceptnet.html b/4.0/_modules/pythainlp/corpus/conceptnet.html
index dd06fef..0ee6ea6 100644
--- a/4.0/_modules/pythainlp/corpus/conceptnet.html
+++ b/4.0/_modules/pythainlp/corpus/conceptnet.html
@@ -248,5 +248,5 @@ 

Source code for pythainlp.corpus.conceptnet

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/corpus/core.html b/4.0/_modules/pythainlp/corpus/core.html
index 4a76b67..036870d 100644
--- a/4.0/_modules/pythainlp/corpus/core.html
+++ b/4.0/_modules/pythainlp/corpus/core.html
@@ -687,5 +687,5 @@ 

Source code for pythainlp.corpus.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/corpus/oscar.html b/4.0/_modules/pythainlp/corpus/oscar.html
index 64ef1b9..5226420 100644
--- a/4.0/_modules/pythainlp/corpus/oscar.html
+++ b/4.0/_modules/pythainlp/corpus/oscar.html
@@ -194,5 +194,5 @@ 

Source code for pythainlp.corpus.oscar

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/corpus/tnc.html b/4.0/_modules/pythainlp/corpus/tnc.html
index fc0c56f..5549c2e 100644
--- a/4.0/_modules/pythainlp/corpus/tnc.html
+++ b/4.0/_modules/pythainlp/corpus/tnc.html
@@ -221,5 +221,5 @@ 

Source code for pythainlp.corpus.tnc

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/corpus/ttc.html b/4.0/_modules/pythainlp/corpus/ttc.html
index 5a06e07..169b059 100644
--- a/4.0/_modules/pythainlp/corpus/ttc.html
+++ b/4.0/_modules/pythainlp/corpus/ttc.html
@@ -185,5 +185,5 @@ 

Source code for pythainlp.corpus.ttc

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/corpus/util.html b/4.0/_modules/pythainlp/corpus/util.html
index eeb0a0b..e77ded4 100644
--- a/4.0/_modules/pythainlp/corpus/util.html
+++ b/4.0/_modules/pythainlp/corpus/util.html
@@ -280,5 +280,5 @@ 

Source code for pythainlp.corpus.util

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/corpus/wordnet.html b/4.0/_modules/pythainlp/corpus/wordnet.html
index 1e6422d..bbef113 100644
--- a/4.0/_modules/pythainlp/corpus/wordnet.html
+++ b/4.0/_modules/pythainlp/corpus/wordnet.html
@@ -579,5 +579,5 @@ 

Source code for pythainlp.corpus.wordnet

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/generate/core.html b/4.0/_modules/pythainlp/generate/core.html
index 75c0666..7a0ff1c 100644
--- a/4.0/_modules/pythainlp/generate/core.html
+++ b/4.0/_modules/pythainlp/generate/core.html
@@ -433,5 +433,5 @@ 

Source code for pythainlp.generate.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/generate/thai2fit.html b/4.0/_modules/pythainlp/generate/thai2fit.html
index 2c16a8c..6acc207 100644
--- a/4.0/_modules/pythainlp/generate/thai2fit.html
+++ b/4.0/_modules/pythainlp/generate/thai2fit.html
@@ -251,5 +251,5 @@ 

Source code for pythainlp.generate.thai2fit

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/khavee/core.html b/4.0/_modules/pythainlp/khavee/core.html
index 0687fb6..d5612b3 100644
--- a/4.0/_modules/pythainlp/khavee/core.html
+++ b/4.0/_modules/pythainlp/khavee/core.html
@@ -581,5 +581,5 @@ 

Source code for pythainlp.khavee.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/parse/core.html b/4.0/_modules/pythainlp/parse/core.html
index aaa827d..c38f7c1 100644
--- a/4.0/_modules/pythainlp/parse/core.html
+++ b/4.0/_modules/pythainlp/parse/core.html
@@ -252,5 +252,5 @@ 

Source code for pythainlp.parse.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/soundex/core.html b/4.0/_modules/pythainlp/soundex/core.html
index b05fa9a..c23baa4 100644
--- a/4.0/_modules/pythainlp/soundex/core.html
+++ b/4.0/_modules/pythainlp/soundex/core.html
@@ -217,5 +217,5 @@ 

Source code for pythainlp.soundex.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/soundex/lk82.html b/4.0/_modules/pythainlp/soundex/lk82.html
index bb923e2..2ecd646 100644
--- a/4.0/_modules/pythainlp/soundex/lk82.html
+++ b/4.0/_modules/pythainlp/soundex/lk82.html
@@ -260,5 +260,5 @@ 

Source code for pythainlp.soundex.lk82

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/soundex/metasound.html b/4.0/_modules/pythainlp/soundex/metasound.html
index da08e85..39a4ea5 100644
--- a/4.0/_modules/pythainlp/soundex/metasound.html
+++ b/4.0/_modules/pythainlp/soundex/metasound.html
@@ -244,5 +244,5 @@ 

Source code for pythainlp.soundex.metasound

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/soundex/prayut_and_somchaip.html b/4.0/_modules/pythainlp/soundex/prayut_and_somchaip.html
index 73cc778..3f4456d 100644
--- a/4.0/_modules/pythainlp/soundex/prayut_and_somchaip.html
+++ b/4.0/_modules/pythainlp/soundex/prayut_and_somchaip.html
@@ -225,5 +225,5 @@ 

Source code for pythainlp.soundex.prayut_and_somchaip

\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/soundex/udom83.html b/4.0/_modules/pythainlp/soundex/udom83.html
index 86f5d8a..601612f 100644
--- a/4.0/_modules/pythainlp/soundex/udom83.html
+++ b/4.0/_modules/pythainlp/soundex/udom83.html
@@ -231,5 +231,5 @@

Source code for pythainlp.soundex.udom83

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/spell/core.html b/4.0/_modules/pythainlp/spell/core.html
index e229f08..7b3ea55 100644
--- a/4.0/_modules/pythainlp/spell/core.html
+++ b/4.0/_modules/pythainlp/spell/core.html
@@ -321,5 +321,5 @@ 

Source code for pythainlp.spell.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/spell/pn.html b/4.0/_modules/pythainlp/spell/pn.html
index a315411..7a87f72 100644
--- a/4.0/_modules/pythainlp/spell/pn.html
+++ b/4.0/_modules/pythainlp/spell/pn.html
@@ -497,5 +497,5 @@ 

Source code for pythainlp.spell.pn

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/summarize/core.html b/4.0/_modules/pythainlp/summarize/core.html
index 8b46324..69441c3 100644
--- a/4.0/_modules/pythainlp/summarize/core.html
+++ b/4.0/_modules/pythainlp/summarize/core.html
@@ -388,5 +388,5 @@ 

Source code for pythainlp.summarize.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/summarize/keybert.html b/4.0/_modules/pythainlp/summarize/keybert.html
index 0e06778..1785ea9 100644
--- a/4.0/_modules/pythainlp/summarize/keybert.html
+++ b/4.0/_modules/pythainlp/summarize/keybert.html
@@ -365,5 +365,5 @@ 

Source code for pythainlp.summarize.keybert

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tag/chunk.html b/4.0/_modules/pythainlp/tag/chunk.html
index d17e50a..3dc2520 100644
--- a/4.0/_modules/pythainlp/tag/chunk.html
+++ b/4.0/_modules/pythainlp/tag/chunk.html
@@ -171,5 +171,5 @@ 

Source code for pythainlp.tag.chunk

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tag/locations.html b/4.0/_modules/pythainlp/tag/locations.html
index 3dec2ea..c4f7f4c 100644
--- a/4.0/_modules/pythainlp/tag/locations.html
+++ b/4.0/_modules/pythainlp/tag/locations.html
@@ -174,5 +174,5 @@ 

Source code for pythainlp.tag.locations

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tag/named_entity.html b/4.0/_modules/pythainlp/tag/named_entity.html
index a06b6e8..dd8864a 100644
--- a/4.0/_modules/pythainlp/tag/named_entity.html
+++ b/4.0/_modules/pythainlp/tag/named_entity.html
@@ -297,5 +297,5 @@ 

Source code for pythainlp.tag.named_entity

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tag/pos_tag.html b/4.0/_modules/pythainlp/tag/pos_tag.html
index 6ed2312..dc02c0b 100644
--- a/4.0/_modules/pythainlp/tag/pos_tag.html
+++ b/4.0/_modules/pythainlp/tag/pos_tag.html
@@ -304,5 +304,5 @@ 

Source code for pythainlp.tag.pos_tag

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tag/thainer.html b/4.0/_modules/pythainlp/tag/thainer.html
index 3c4359e..6594e1e 100644
--- a/4.0/_modules/pythainlp/tag/thainer.html
+++ b/4.0/_modules/pythainlp/tag/thainer.html
@@ -344,5 +344,5 @@ 

Source code for pythainlp.tag.thainer

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tag/tltk.html b/4.0/_modules/pythainlp/tag/tltk.html
index 3f9c588..d009820 100644
--- a/4.0/_modules/pythainlp/tag/tltk.html
+++ b/4.0/_modules/pythainlp/tag/tltk.html
@@ -233,5 +233,5 @@ 

Source code for pythainlp.tag.tltk

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/attacut.html b/4.0/_modules/pythainlp/tokenize/attacut.html
index bdcf1be..053f627 100644
--- a/4.0/_modules/pythainlp/tokenize/attacut.html
+++ b/4.0/_modules/pythainlp/tokenize/attacut.html
@@ -181,5 +181,5 @@ 

Source code for pythainlp.tokenize.attacut

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/core.html b/4.0/_modules/pythainlp/tokenize/core.html
index 6088025..a36e441 100644
--- a/4.0/_modules/pythainlp/tokenize/core.html
+++ b/4.0/_modules/pythainlp/tokenize/core.html
@@ -792,5 +792,5 @@ 

Source code for pythainlp.tokenize.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/crfcut.html b/4.0/_modules/pythainlp/tokenize/crfcut.html
index ef712da..f13542e 100644
--- a/4.0/_modules/pythainlp/tokenize/crfcut.html
+++ b/4.0/_modules/pythainlp/tokenize/crfcut.html
@@ -347,5 +347,5 @@ 

Source code for pythainlp.tokenize.crfcut

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/deepcut.html b/4.0/_modules/pythainlp/tokenize/deepcut.html
index d8a73af..559fd15 100644
--- a/4.0/_modules/pythainlp/tokenize/deepcut.html
+++ b/4.0/_modules/pythainlp/tokenize/deepcut.html
@@ -172,5 +172,5 @@ 

Source code for pythainlp.tokenize.deepcut

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/etcc.html b/4.0/_modules/pythainlp/tokenize/etcc.html
index bb75412..5181db1 100644
--- a/4.0/_modules/pythainlp/tokenize/etcc.html
+++ b/4.0/_modules/pythainlp/tokenize/etcc.html
@@ -203,5 +203,5 @@ 

Source code for pythainlp.tokenize.etcc

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/longest.html b/4.0/_modules/pythainlp/tokenize/longest.html
index 59e5599..7731945 100644
--- a/4.0/_modules/pythainlp/tokenize/longest.html
+++ b/4.0/_modules/pythainlp/tokenize/longest.html
@@ -283,5 +283,5 @@ 

Source code for pythainlp.tokenize.longest

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/multi_cut.html b/4.0/_modules/pythainlp/tokenize/multi_cut.html
index 995e072..334325b 100644
--- a/4.0/_modules/pythainlp/tokenize/multi_cut.html
+++ b/4.0/_modules/pythainlp/tokenize/multi_cut.html
@@ -298,5 +298,5 @@ 

Source code for pythainlp.tokenize.multi_cut

 
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/nercut.html b/4.0/_modules/pythainlp/tokenize/nercut.html
index b2d5a78..b7541db 100644
--- a/4.0/_modules/pythainlp/tokenize/nercut.html
+++ b/4.0/_modules/pythainlp/tokenize/nercut.html
@@ -215,5 +215,5 @@ 

Source code for pythainlp.tokenize.nercut

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/newmm.html b/4.0/_modules/pythainlp/tokenize/newmm.html
index fe59ff6..837fe44 100644
--- a/4.0/_modules/pythainlp/tokenize/newmm.html
+++ b/4.0/_modules/pythainlp/tokenize/newmm.html
@@ -346,5 +346,5 @@ 

Source code for pythainlp.tokenize.newmm

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/nlpo3.html b/4.0/_modules/pythainlp/tokenize/nlpo3.html
index a2a97c6..657ce85 100644
--- a/4.0/_modules/pythainlp/tokenize/nlpo3.html
+++ b/4.0/_modules/pythainlp/tokenize/nlpo3.html
@@ -206,5 +206,5 @@ 

Source code for pythainlp.tokenize.nlpo3

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/oskut.html b/4.0/_modules/pythainlp/tokenize/oskut.html
index a158337..bbe6589 100644
--- a/4.0/_modules/pythainlp/tokenize/oskut.html
+++ b/4.0/_modules/pythainlp/tokenize/oskut.html
@@ -164,5 +164,5 @@ 

Source code for pythainlp.tokenize.oskut

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/pyicu.html b/4.0/_modules/pythainlp/tokenize/pyicu.html
index af6b84f..b5c7ce0 100644
--- a/4.0/_modules/pythainlp/tokenize/pyicu.html
+++ b/4.0/_modules/pythainlp/tokenize/pyicu.html
@@ -174,5 +174,5 @@ 

Source code for pythainlp.tokenize.pyicu

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/sefr_cut.html b/4.0/_modules/pythainlp/tokenize/sefr_cut.html
index 6095694..ab1fe9f 100644
--- a/4.0/_modules/pythainlp/tokenize/sefr_cut.html
+++ b/4.0/_modules/pythainlp/tokenize/sefr_cut.html
@@ -163,5 +163,5 @@ 

Source code for pythainlp.tokenize.sefr_cut

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/tcc.html b/4.0/_modules/pythainlp/tokenize/tcc.html
index cdb13b6..26e88fb 100644
--- a/4.0/_modules/pythainlp/tokenize/tcc.html
+++ b/4.0/_modules/pythainlp/tokenize/tcc.html
@@ -252,5 +252,5 @@ 

Source code for pythainlp.tokenize.tcc

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/tcc_p.html b/4.0/_modules/pythainlp/tokenize/tcc_p.html
index 43f18a2..5cde068 100644
--- a/4.0/_modules/pythainlp/tokenize/tcc_p.html
+++ b/4.0/_modules/pythainlp/tokenize/tcc_p.html
@@ -252,5 +252,5 @@ 

Source code for pythainlp.tokenize.tcc_p

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tokenize/thaisumcut.html b/4.0/_modules/pythainlp/tokenize/thaisumcut.html
index d7c1c41..3cbc525 100644
--- a/4.0/_modules/pythainlp/tokenize/thaisumcut.html
+++ b/4.0/_modules/pythainlp/tokenize/thaisumcut.html
@@ -517,5 +517,5 @@ 

Source code for pythainlp.tokenize.thaisumcut

-
+
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tools/misspell.html b/4.0/_modules/pythainlp/tools/misspell.html
index a13229e..d623df4 100644
--- a/4.0/_modules/pythainlp/tools/misspell.html
+++ b/4.0/_modules/pythainlp/tools/misspell.html
@@ -280,5 +280,5 @@

Source code for pythainlp.tools.misspell

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/tools/path.html b/4.0/_modules/pythainlp/tools/path.html
index 46266b8..1d5b1f9 100644
--- a/4.0/_modules/pythainlp/tools/path.html
+++ b/4.0/_modules/pythainlp/tools/path.html
@@ -214,5 +214,5 @@ 

Source code for pythainlp.tools.path

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/translate/core.html b/4.0/_modules/pythainlp/translate/core.html
index 6625c7a..7fadad9 100644
--- a/4.0/_modules/pythainlp/translate/core.html
+++ b/4.0/_modules/pythainlp/translate/core.html
@@ -229,5 +229,5 @@ 

Source code for pythainlp.translate.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/translate/en_th.html b/4.0/_modules/pythainlp/translate/en_th.html
index 591b19b..fbda060 100644
--- a/4.0/_modules/pythainlp/translate/en_th.html
+++ b/4.0/_modules/pythainlp/translate/en_th.html
@@ -301,5 +301,5 @@ 

Source code for pythainlp.translate.en_th

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/translate/th_fr.html b/4.0/_modules/pythainlp/translate/th_fr.html
index 1d908cd..f13182f 100644
--- a/4.0/_modules/pythainlp/translate/th_fr.html
+++ b/4.0/_modules/pythainlp/translate/th_fr.html
@@ -206,5 +206,5 @@ 

Source code for pythainlp.translate.th_fr

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/translate/zh_th.html b/4.0/_modules/pythainlp/translate/zh_th.html
index a99d278..1030205 100644
--- a/4.0/_modules/pythainlp/translate/zh_th.html
+++ b/4.0/_modules/pythainlp/translate/zh_th.html
@@ -251,5 +251,5 @@ 

Source code for pythainlp.translate.zh_th

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/transliterate/core.html b/4.0/_modules/pythainlp/transliterate/core.html
index 2c26216..15c773c 100644
--- a/4.0/_modules/pythainlp/transliterate/core.html
+++ b/4.0/_modules/pythainlp/transliterate/core.html
@@ -336,5 +336,5 @@ 

Source code for pythainlp.transliterate.core

 
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/transliterate/ipa.html b/4.0/_modules/pythainlp/transliterate/ipa.html
index 73ef29d..3266467 100644
--- a/4.0/_modules/pythainlp/transliterate/ipa.html
+++ b/4.0/_modules/pythainlp/transliterate/ipa.html
@@ -165,5 +165,5 @@ 

Source code for pythainlp.transliterate.ipa

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/transliterate/iso_11940.html b/4.0/_modules/pythainlp/transliterate/iso_11940.html
index b31da14..66c5bb2 100644
--- a/4.0/_modules/pythainlp/transliterate/iso_11940.html
+++ b/4.0/_modules/pythainlp/transliterate/iso_11940.html
@@ -274,5 +274,5 @@ 

Source code for pythainlp.transliterate.iso_11940

-
+
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/transliterate/pyicu.html b/4.0/_modules/pythainlp/transliterate/pyicu.html
index fe48f4a..3193a1d 100644
--- a/4.0/_modules/pythainlp/transliterate/pyicu.html
+++ b/4.0/_modules/pythainlp/transliterate/pyicu.html
@@ -160,5 +160,5 @@

Source code for pythainlp.transliterate.pyicu

-
+
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/transliterate/spoonerism.html b/4.0/_modules/pythainlp/transliterate/spoonerism.html
index dff5c00..c58af68 100644
--- a/4.0/_modules/pythainlp/transliterate/spoonerism.html
+++ b/4.0/_modules/pythainlp/transliterate/spoonerism.html
@@ -215,5 +215,5 @@

Source code for pythainlp.transliterate.spoonerism

\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/transliterate/thaig2p.html b/4.0/_modules/pythainlp/transliterate/thaig2p.html
index b0ade86..409e3e1 100644
--- a/4.0/_modules/pythainlp/transliterate/thaig2p.html
+++ b/4.0/_modules/pythainlp/transliterate/thaig2p.html
@@ -501,5 +501,5 @@

Source code for pythainlp.transliterate.thaig2p

});
-
+
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/transliterate/tltk.html b/4.0/_modules/pythainlp/transliterate/tltk.html
index 7da0e04..6e970b1 100644
--- a/4.0/_modules/pythainlp/transliterate/tltk.html
+++ b/4.0/_modules/pythainlp/transliterate/tltk.html
@@ -166,5 +166,5 @@

Source code for pythainlp.transliterate.tltk

 
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/transliterate/wunsen.html b/4.0/_modules/pythainlp/transliterate/wunsen.html
index 24ea053..ea0ce39 100644
--- a/4.0/_modules/pythainlp/transliterate/wunsen.html
+++ b/4.0/_modules/pythainlp/transliterate/wunsen.html
@@ -282,5 +282,5 @@ 

Source code for pythainlp.transliterate.wunsen

-
+
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/ulmfit/core.html b/4.0/_modules/pythainlp/ulmfit/core.html
index 1a3eb49..d1e0fb1 100644
--- a/4.0/_modules/pythainlp/ulmfit/core.html
+++ b/4.0/_modules/pythainlp/ulmfit/core.html
@@ -395,5 +395,5 @@

Source code for pythainlp.ulmfit.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/ulmfit/preprocess.html b/4.0/_modules/pythainlp/ulmfit/preprocess.html
index 02f8eff..3fd3df6 100644
--- a/4.0/_modules/pythainlp/ulmfit/preprocess.html
+++ b/4.0/_modules/pythainlp/ulmfit/preprocess.html
@@ -427,5 +427,5 @@ 

Source code for pythainlp.ulmfit.preprocess

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/ulmfit/tokenizer.html b/4.0/_modules/pythainlp/ulmfit/tokenizer.html
index 9acf8f2..3c35f90 100644
--- a/4.0/_modules/pythainlp/ulmfit/tokenizer.html
+++ b/4.0/_modules/pythainlp/ulmfit/tokenizer.html
@@ -206,5 +206,5 @@ 

Source code for pythainlp.ulmfit.tokenizer

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/collate.html b/4.0/_modules/pythainlp/util/collate.html
index 5adce78..9d6e400 100644
--- a/4.0/_modules/pythainlp/util/collate.html
+++ b/4.0/_modules/pythainlp/util/collate.html
@@ -188,5 +188,5 @@ 

Source code for pythainlp.util.collate

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/date.html b/4.0/_modules/pythainlp/util/date.html
index d015947..16da3c2 100644
--- a/4.0/_modules/pythainlp/util/date.html
+++ b/4.0/_modules/pythainlp/util/date.html
@@ -534,5 +534,5 @@ 

Source code for pythainlp.util.date

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/digitconv.html b/4.0/_modules/pythainlp/util/digitconv.html
index f3c9fb1..8742f01 100644
--- a/4.0/_modules/pythainlp/util/digitconv.html
+++ b/4.0/_modules/pythainlp/util/digitconv.html
@@ -336,5 +336,5 @@ 

Source code for pythainlp.util.digitconv

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/emojiconv.html b/4.0/_modules/pythainlp/util/emojiconv.html
index 39e08f9..666a028 100644
--- a/4.0/_modules/pythainlp/util/emojiconv.html
+++ b/4.0/_modules/pythainlp/util/emojiconv.html
@@ -2000,5 +2000,5 @@ 

Source code for pythainlp.util.emojiconv

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/keyboard.html b/4.0/_modules/pythainlp/util/keyboard.html
index ada57cd..0c0ddf9 100644
--- a/4.0/_modules/pythainlp/util/keyboard.html
+++ b/4.0/_modules/pythainlp/util/keyboard.html
@@ -363,5 +363,5 @@ 

Source code for pythainlp.util.keyboard

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/keywords.html b/4.0/_modules/pythainlp/util/keywords.html
index 138c319..a7f315c 100644
--- a/4.0/_modules/pythainlp/util/keywords.html
+++ b/4.0/_modules/pythainlp/util/keywords.html
@@ -240,5 +240,5 @@ 

Source code for pythainlp.util.keywords

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/normalize.html b/4.0/_modules/pythainlp/util/normalize.html
index ca537db..b809e62 100644
--- a/4.0/_modules/pythainlp/util/normalize.html
+++ b/4.0/_modules/pythainlp/util/normalize.html
@@ -428,5 +428,5 @@ 

Source code for pythainlp.util.normalize

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/numtoword.html b/4.0/_modules/pythainlp/util/numtoword.html
index ba959a6..f2bcfa8 100644
--- a/4.0/_modules/pythainlp/util/numtoword.html
+++ b/4.0/_modules/pythainlp/util/numtoword.html
@@ -258,5 +258,5 @@ 

Source code for pythainlp.util.numtoword

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/phoneme.html b/4.0/_modules/pythainlp/util/phoneme.html
index b351ff4..d786480 100644
--- a/4.0/_modules/pythainlp/util/phoneme.html
+++ b/4.0/_modules/pythainlp/util/phoneme.html
@@ -373,5 +373,5 @@ 

Source code for pythainlp.util.phoneme

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/strftime.html b/4.0/_modules/pythainlp/util/strftime.html
index eb0bc33..1f7626a 100644
--- a/4.0/_modules/pythainlp/util/strftime.html
+++ b/4.0/_modules/pythainlp/util/strftime.html
@@ -475,5 +475,5 @@ 

Source code for pythainlp.util.strftime

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/syllable.html b/4.0/_modules/pythainlp/util/syllable.html
index 542636e..213961d 100644
--- a/4.0/_modules/pythainlp/util/syllable.html
+++ b/4.0/_modules/pythainlp/util/syllable.html
@@ -454,5 +454,5 @@ 

Source code for pythainlp.util.syllable

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/thai.html b/4.0/_modules/pythainlp/util/thai.html
index 27106fa..8216158 100644
--- a/4.0/_modules/pythainlp/util/thai.html
+++ b/4.0/_modules/pythainlp/util/thai.html
@@ -405,5 +405,5 @@ 

Source code for pythainlp.util.thai

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/thaiwordcheck.html b/4.0/_modules/pythainlp/util/thaiwordcheck.html
index fb980bd..e423a50 100644
--- a/4.0/_modules/pythainlp/util/thaiwordcheck.html
+++ b/4.0/_modules/pythainlp/util/thaiwordcheck.html
@@ -266,5 +266,5 @@ 

Source code for pythainlp.util.thaiwordcheck

 
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/time.html b/4.0/_modules/pythainlp/util/time.html
index 4d2d8ab..c8e1338 100644
--- a/4.0/_modules/pythainlp/util/time.html
+++ b/4.0/_modules/pythainlp/util/time.html
@@ -468,5 +468,5 @@ 

Source code for pythainlp.util.time

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/trie.html b/4.0/_modules/pythainlp/util/trie.html
index 60c3dde..44ef6f3 100644
--- a/4.0/_modules/pythainlp/util/trie.html
+++ b/4.0/_modules/pythainlp/util/trie.html
@@ -266,5 +266,5 @@ 

Source code for pythainlp.util.trie

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/util/wordtonum.html b/4.0/_modules/pythainlp/util/wordtonum.html
index 7f270b5..c673996 100644
--- a/4.0/_modules/pythainlp/util/wordtonum.html
+++ b/4.0/_modules/pythainlp/util/wordtonum.html
@@ -353,5 +353,5 @@ 

Source code for pythainlp.util.wordtonum

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/wangchanberta/core.html b/4.0/_modules/pythainlp/wangchanberta/core.html
index d5c3d03..98d1db9 100644
--- a/4.0/_modules/pythainlp/wangchanberta/core.html
+++ b/4.0/_modules/pythainlp/wangchanberta/core.html
@@ -358,5 +358,5 @@ 

Source code for pythainlp.wangchanberta.core

 
 
-
+
 
\ No newline at end of file
diff --git a/4.0/_modules/pythainlp/word_vector/core.html b/4.0/_modules/pythainlp/word_vector/core.html
index bf441cc..cdbcf90 100644
--- a/4.0/_modules/pythainlp/word_vector/core.html
+++ b/4.0/_modules/pythainlp/word_vector/core.html
@@ -457,5 +457,5 @@ 

Source code for pythainlp.word_vector.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/4.0/api/augment.html b/4.0/api/augment.html
index e4b1a83..afb8e74 100644
--- a/4.0/api/augment.html
+++ b/4.0/api/augment.html
@@ -625,5 +625,5 @@ 

Modules
\ No newline at end of file
diff --git a/4.0/api/benchmarks.html b/4.0/api/benchmarks.html
index b35891e..f9813e1 100644
--- a/4.0/api/benchmarks.html
+++ b/4.0/api/benchmarks.html
@@ -235,5 +235,5 @@

Quality
\ No newline at end of file
diff --git a/4.0/api/corpus.html b/4.0/api/corpus.html
index 2e7cafd..558f0c7 100644
--- a/4.0/api/corpus.html
+++ b/4.0/api/corpus.html
@@ -1385,5 +1385,5 @@

Definition
\ No newline at end of file
diff --git a/4.0/api/generate.html b/4.0/api/generate.html
index 074a0ee..4b68263 100644
--- a/4.0/api/generate.html
+++ b/4.0/api/generate.html
@@ -381,5 +381,5 @@

Modules
\ No newline at end of file
diff --git a/4.0/api/khavee.html b/4.0/api/khavee.html
index d7f89da..f81daa2 100644
--- a/4.0/api/khavee.html
+++ b/4.0/api/khavee.html
@@ -334,5 +334,5 @@

Modules
\ No newline at end of file
diff --git a/4.0/api/parse.html b/4.0/api/parse.html
index 647b73a..e9efa67 100644
--- a/4.0/api/parse.html
+++ b/4.0/api/parse.html
@@ -214,5 +214,5 @@

Modules
\ No newline at end of file
diff --git a/4.0/api/soundex.html b/4.0/api/soundex.html
index d31031e..40e7efa 100644
--- a/4.0/api/soundex.html
+++ b/4.0/api/soundex.html
@@ -393,5 +393,5 @@

References
\ No newline at end of file
diff --git a/4.0/api/spell.html b/4.0/api/spell.html
index 5e24f16..0586c61 100644
--- a/4.0/api/spell.html
+++ b/4.0/api/spell.html
@@ -610,5 +610,5 @@

References
\ No newline at end of file
diff --git a/4.0/api/summarize.html b/4.0/api/summarize.html
index 84933c9..60878b1 100644
--- a/4.0/api/summarize.html
+++ b/4.0/api/summarize.html
@@ -492,5 +492,5 @@

Keyword Extraction Engines
\ No newline at end of file
diff --git a/4.0/api/tag.html b/4.0/api/tag.html
index b76132b..e28f6bd 100644
--- a/4.0/api/tag.html
+++ b/4.0/api/tag.html
@@ -1310,5 +1310,5 @@

References
\ No newline at end of file
diff --git a/4.0/api/tokenize.html b/4.0/api/tokenize.html
index fb21ea2..d55fca1 100644
--- a/4.0/api/tokenize.html
+++ b/4.0/api/tokenize.html
@@ -1641,5 +1641,5 @@

Subword level
\ No newline at end of file
diff --git a/4.0/api/tools.html b/4.0/api/tools.html
index 5b7fec9..c99641b 100644
--- a/4.0/api/tools.html
+++ b/4.0/api/tools.html
@@ -250,5 +250,5 @@

Modules
\ No newline at end of file
diff --git a/4.0/api/translate.html b/4.0/api/translate.html
index 4d4f897..fe72d1a 100644
--- a/4.0/api/translate.html
+++ b/4.0/api/translate.html
@@ -496,5 +496,5 @@

Modules
\ No newline at end of file
diff --git a/4.0/api/transliterate.html b/4.0/api/transliterate.html
index f9c2120..cab8528 100644
--- a/4.0/api/transliterate.html
+++ b/4.0/api/transliterate.html
@@ -671,5 +671,5 @@

References
\ No newline at end of file
diff --git a/4.0/api/ulmfit.html b/4.0/api/ulmfit.html
index c9bacfe..1573d76 100644
--- a/4.0/api/ulmfit.html
+++ b/4.0/api/ulmfit.html
@@ -586,5 +586,5 @@

Modules
\ No newline at end of file
diff --git a/4.0/api/util.html b/4.0/api/util.html
index 02222f3..a1b487e 100644
--- a/4.0/api/util.html
+++ b/4.0/api/util.html
@@ -1860,5 +1860,5 @@

References
\ No newline at end of file
diff --git a/4.0/api/wangchanberta.html b/4.0/api/wangchanberta.html
index 5f4df3f..0393040 100644
--- a/4.0/api/wangchanberta.html
+++ b/4.0/api/wangchanberta.html
@@ -293,5 +293,5 @@

References
\ No newline at end of file
diff --git a/4.0/api/word_vector.html b/4.0/api/word_vector.html
index 531d215..381eebc 100644
--- a/4.0/api/word_vector.html
+++ b/4.0/api/word_vector.html
@@ -496,5 +496,5 @@

References
\ No newline at end of file
diff --git a/4.0/genindex.html b/4.0/genindex.html
index 5473fc1..eb1258f 100644
--- a/4.0/genindex.html
+++ b/4.0/genindex.html
@@ -1202,5 +1202,5 @@

Z

});
-
+
\ No newline at end of file
diff --git a/4.0/index.html b/4.0/index.html
index c0d68f7..318cde2 100644
--- a/4.0/index.html
+++ b/4.0/index.html
@@ -189,5 +189,5 @@

Citations
\ No newline at end of file
diff --git a/4.0/notes/FAQ.html b/4.0/notes/FAQ.html
index 54db1fc..a170bc3 100644
--- a/4.0/notes/FAQ.html
+++ b/4.0/notes/FAQ.html
@@ -136,5 +136,5 @@

FAQ
-
+
\ No newline at end of file
diff --git a/4.0/notes/command_line.html b/4.0/notes/command_line.html
index e696291..e6b0cda 100644
--- a/4.0/notes/command_line.html
+++ b/4.0/notes/command_line.html
@@ -230,5 +230,5 @@

Command Line
\ No newline at end of file
diff --git a/4.0/notes/getting_started.html b/4.0/notes/getting_started.html
index 376f35c..94cd16c 100644
--- a/4.0/notes/getting_started.html
+++ b/4.0/notes/getting_started.html
@@ -155,5 +155,5 @@

Tutorial Notebooks
\ No newline at end of file
diff --git a/4.0/notes/installation.html b/4.0/notes/installation.html
index f62f4e2..e5bbe28 100644
--- a/4.0/notes/installation.html
+++ b/4.0/notes/installation.html
@@ -230,5 +230,5 @@

FAQ
-
+
\ No newline at end of file
diff --git a/4.0/notes/license.html b/4.0/notes/license.html
index 5ca34f8..74a9b10 100644
--- a/4.0/notes/license.html
+++ b/4.0/notes/license.html
@@ -156,5 +156,5 @@

License
\ No newline at end of file
diff --git a/4.0/py-modindex.html b/4.0/py-modindex.html
index 8dd1141..01c56e7 100644
--- a/4.0/py-modindex.html
+++ b/4.0/py-modindex.html
@@ -250,5 +250,5 @@

Python Module Index

});
-
+
\ No newline at end of file
diff --git a/4.0/search.html b/4.0/search.html
index 1f7913e..f881418 100644
--- a/4.0/search.html
+++ b/4.0/search.html
@@ -145,5 +145,5 @@
-
+
\ No newline at end of file
diff --git a/5.0/_modules/index.html b/5.0/_modules/index.html
index 9fb6b85..55effa6 100644
--- a/5.0/_modules/index.html
+++ b/5.0/_modules/index.html
@@ -235,5 +235,5 @@

All modules for which code is available

});
-
+
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/ancient/aksonhan.html b/5.0/_modules/pythainlp/ancient/aksonhan.html
index 014f0ba..653db6d 100644
--- a/5.0/_modules/pythainlp/ancient/aksonhan.html
+++ b/5.0/_modules/pythainlp/ancient/aksonhan.html
@@ -203,5 +203,5 @@

Source code for pythainlp.ancient.aksonhan

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/augment/lm/fasttext.html b/5.0/_modules/pythainlp/augment/lm/fasttext.html
index ac1fce7..4fe9c0b 100644
--- a/5.0/_modules/pythainlp/augment/lm/fasttext.html
+++ b/5.0/_modules/pythainlp/augment/lm/fasttext.html
@@ -218,5 +218,5 @@ 

Source code for pythainlp.augment.lm.fasttext

-
+
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/augment/lm/wangchanberta.html b/5.0/_modules/pythainlp/augment/lm/wangchanberta.html
index e7998f0..006a167 100644
--- a/5.0/_modules/pythainlp/augment/lm/wangchanberta.html
+++ b/5.0/_modules/pythainlp/augment/lm/wangchanberta.html
@@ -218,5 +218,5 @@

Source code for pythainlp.augment.lm.wangchanberta

\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/augment/word2vec/bpemb_wv.html b/5.0/_modules/pythainlp/augment/word2vec/bpemb_wv.html
index 127cb3e..a7d8f4c 100644
--- a/5.0/_modules/pythainlp/augment/word2vec/bpemb_wv.html
+++ b/5.0/_modules/pythainlp/augment/word2vec/bpemb_wv.html
@@ -205,5 +205,5 @@

Source code for pythainlp.augment.word2vec.bpemb_wv

\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/augment/word2vec/core.html b/5.0/_modules/pythainlp/augment/word2vec/core.html
index bde8e6d..359e9d9 100644
--- a/5.0/_modules/pythainlp/augment/word2vec/core.html
+++ b/5.0/_modules/pythainlp/augment/word2vec/core.html
@@ -202,5 +202,5 @@

Source code for pythainlp.augment.word2vec.core

});
-
+
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/augment/word2vec/ltw2v.html b/5.0/_modules/pythainlp/augment/word2vec/ltw2v.html
index 23c0807..277a383 100644
--- a/5.0/_modules/pythainlp/augment/word2vec/ltw2v.html
+++ b/5.0/_modules/pythainlp/augment/word2vec/ltw2v.html
@@ -195,5 +195,5 @@

Source code for pythainlp.augment.word2vec.ltw2v

});
-
+
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/augment/word2vec/thai2fit.html b/5.0/_modules/pythainlp/augment/word2vec/thai2fit.html
index 95ff885..1a68754 100644
--- a/5.0/_modules/pythainlp/augment/word2vec/thai2fit.html
+++ b/5.0/_modules/pythainlp/augment/word2vec/thai2fit.html
@@ -195,5 +195,5 @@

Source code for pythainlp.augment.word2vec.thai2fit

\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/augment/wordnet.html b/5.0/_modules/pythainlp/augment/wordnet.html
index e6f83ab..be75d74 100644
--- a/5.0/_modules/pythainlp/augment/wordnet.html
+++ b/5.0/_modules/pythainlp/augment/wordnet.html
@@ -354,5 +354,5 @@

Source code for pythainlp.augment.wordnet

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/benchmarks/word_tokenization.html b/5.0/_modules/pythainlp/benchmarks/word_tokenization.html
index f2e22a3..10a469d 100644
--- a/5.0/_modules/pythainlp/benchmarks/word_tokenization.html
+++ b/5.0/_modules/pythainlp/benchmarks/word_tokenization.html
@@ -411,5 +411,5 @@ 

Source code for pythainlp.benchmarks.word_tokenization

\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/chat/core.html b/5.0/_modules/pythainlp/chat/core.html
index f408cc9..4dea25c 100644
--- a/5.0/_modules/pythainlp/chat/core.html
+++ b/5.0/_modules/pythainlp/chat/core.html
@@ -231,5 +231,5 @@

Source code for pythainlp.chat.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/classify/param_free.html b/5.0/_modules/pythainlp/classify/param_free.html
index 4e21200..2109072 100644
--- a/5.0/_modules/pythainlp/classify/param_free.html
+++ b/5.0/_modules/pythainlp/classify/param_free.html
@@ -212,5 +212,5 @@ 

Source code for pythainlp.classify.param_free

-
+
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/coref/core.html b/5.0/_modules/pythainlp/coref/core.html
index bda6735..4743944 100644
--- a/5.0/_modules/pythainlp/coref/core.html
+++ b/5.0/_modules/pythainlp/coref/core.html
@@ -195,5 +195,5 @@

Source code for pythainlp.coref.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/corpus/common.html b/5.0/_modules/pythainlp/corpus/common.html
index 5bcc632..d6923a6 100644
--- a/5.0/_modules/pythainlp/corpus/common.html
+++ b/5.0/_modules/pythainlp/corpus/common.html
@@ -513,5 +513,5 @@ 

Source code for pythainlp.corpus.common

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/corpus/conceptnet.html b/5.0/_modules/pythainlp/corpus/conceptnet.html
index e520ee8..16fce13 100644
--- a/5.0/_modules/pythainlp/corpus/conceptnet.html
+++ b/5.0/_modules/pythainlp/corpus/conceptnet.html
@@ -249,5 +249,5 @@ 

Source code for pythainlp.corpus.conceptnet

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/corpus/core.html b/5.0/_modules/pythainlp/corpus/core.html
index 70843b3..41547ba 100644
--- a/5.0/_modules/pythainlp/corpus/core.html
+++ b/5.0/_modules/pythainlp/corpus/core.html
@@ -743,5 +743,5 @@ 

Source code for pythainlp.corpus.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/corpus/oscar.html b/5.0/_modules/pythainlp/corpus/oscar.html
index d4f59d0..32411bc 100644
--- a/5.0/_modules/pythainlp/corpus/oscar.html
+++ b/5.0/_modules/pythainlp/corpus/oscar.html
@@ -195,5 +195,5 @@ 

Source code for pythainlp.corpus.oscar

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/corpus/th_en_translit.html b/5.0/_modules/pythainlp/corpus/th_en_translit.html
index 891e360..b244ba1 100644
--- a/5.0/_modules/pythainlp/corpus/th_en_translit.html
+++ b/5.0/_modules/pythainlp/corpus/th_en_translit.html
@@ -211,5 +211,5 @@ 

Source code for pythainlp.corpus.th_en_translit

});
-
+
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/corpus/tnc.html b/5.0/_modules/pythainlp/corpus/tnc.html
index 94b295f..2cc906d 100644
--- a/5.0/_modules/pythainlp/corpus/tnc.html
+++ b/5.0/_modules/pythainlp/corpus/tnc.html
@@ -222,5 +222,5 @@

Source code for pythainlp.corpus.tnc

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/corpus/ttc.html b/5.0/_modules/pythainlp/corpus/ttc.html
index 8f55869..6aff61d 100644
--- a/5.0/_modules/pythainlp/corpus/ttc.html
+++ b/5.0/_modules/pythainlp/corpus/ttc.html
@@ -186,5 +186,5 @@ 

Source code for pythainlp.corpus.ttc

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/corpus/util.html b/5.0/_modules/pythainlp/corpus/util.html
index c64ba27..c3e6964 100644
--- a/5.0/_modules/pythainlp/corpus/util.html
+++ b/5.0/_modules/pythainlp/corpus/util.html
@@ -281,5 +281,5 @@ 

Source code for pythainlp.corpus.util

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/corpus/wordnet.html b/5.0/_modules/pythainlp/corpus/wordnet.html
index 38239bd..933fca1 100644
--- a/5.0/_modules/pythainlp/corpus/wordnet.html
+++ b/5.0/_modules/pythainlp/corpus/wordnet.html
@@ -580,5 +580,5 @@ 

Source code for pythainlp.corpus.wordnet

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/el/core.html b/5.0/_modules/pythainlp/el/core.html
index a5da846..d7dcaab 100644
--- a/5.0/_modules/pythainlp/el/core.html
+++ b/5.0/_modules/pythainlp/el/core.html
@@ -187,5 +187,5 @@ 

Source code for pythainlp.el.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/generate/core.html b/5.0/_modules/pythainlp/generate/core.html
index ae7ffbd..3884468 100644
--- a/5.0/_modules/pythainlp/generate/core.html
+++ b/5.0/_modules/pythainlp/generate/core.html
@@ -434,5 +434,5 @@ 

Source code for pythainlp.generate.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/generate/thai2fit.html b/5.0/_modules/pythainlp/generate/thai2fit.html
index a226cf7..45e9a35 100644
--- a/5.0/_modules/pythainlp/generate/thai2fit.html
+++ b/5.0/_modules/pythainlp/generate/thai2fit.html
@@ -252,5 +252,5 @@ 

Source code for pythainlp.generate.thai2fit

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/generate/wangchanglm.html b/5.0/_modules/pythainlp/generate/wangchanglm.html
index 0f12794..be28e07 100644
--- a/5.0/_modules/pythainlp/generate/wangchanglm.html
+++ b/5.0/_modules/pythainlp/generate/wangchanglm.html
@@ -323,5 +323,5 @@ 

Source code for pythainlp.generate.wangchanglm

-
+
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/khavee/core.html b/5.0/_modules/pythainlp/khavee/core.html
index f2d8a71..f068a9a 100644
--- a/5.0/_modules/pythainlp/khavee/core.html
+++ b/5.0/_modules/pythainlp/khavee/core.html
@@ -790,5 +790,5 @@

Source code for pythainlp.khavee.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/morpheme/thaiwordcheck.html b/5.0/_modules/pythainlp/morpheme/thaiwordcheck.html
index 7794c7b..f31d431 100644
--- a/5.0/_modules/pythainlp/morpheme/thaiwordcheck.html
+++ b/5.0/_modules/pythainlp/morpheme/thaiwordcheck.html
@@ -268,5 +268,5 @@ 

Source code for pythainlp.morpheme.thaiwordcheck

});
-
+
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/morpheme/word_formation.html b/5.0/_modules/pythainlp/morpheme/word_formation.html
index 92c1cc1..0299568 100644
--- a/5.0/_modules/pythainlp/morpheme/word_formation.html
+++ b/5.0/_modules/pythainlp/morpheme/word_formation.html
@@ -196,5 +196,5 @@

Source code for pythainlp.morpheme.word_formation

-
+
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/parse/core.html b/5.0/_modules/pythainlp/parse/core.html
index dc0560d..b826358 100644
--- a/5.0/_modules/pythainlp/parse/core.html
+++ b/5.0/_modules/pythainlp/parse/core.html
@@ -260,5 +260,5 @@

Source code for pythainlp.parse.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/phayathaibert/core.html b/5.0/_modules/pythainlp/phayathaibert/core.html
index 6176fe4..f67bbb1 100644
--- a/5.0/_modules/pythainlp/phayathaibert/core.html
+++ b/5.0/_modules/pythainlp/phayathaibert/core.html
@@ -587,5 +587,5 @@ 

Source code for pythainlp.phayathaibert.core

 
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/soundex/core.html b/5.0/_modules/pythainlp/soundex/core.html
index 214fdf6..159f7ed 100644
--- a/5.0/_modules/pythainlp/soundex/core.html
+++ b/5.0/_modules/pythainlp/soundex/core.html
@@ -218,5 +218,5 @@ 

Source code for pythainlp.soundex.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/soundex/lk82.html b/5.0/_modules/pythainlp/soundex/lk82.html
index 440d874..560c96d 100644
--- a/5.0/_modules/pythainlp/soundex/lk82.html
+++ b/5.0/_modules/pythainlp/soundex/lk82.html
@@ -268,5 +268,5 @@ 

Source code for pythainlp.soundex.lk82

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/soundex/metasound.html b/5.0/_modules/pythainlp/soundex/metasound.html
index 893acad..f345f46 100644
--- a/5.0/_modules/pythainlp/soundex/metasound.html
+++ b/5.0/_modules/pythainlp/soundex/metasound.html
@@ -245,5 +245,5 @@ 

Source code for pythainlp.soundex.metasound

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/soundex/prayut_and_somchaip.html b/5.0/_modules/pythainlp/soundex/prayut_and_somchaip.html
index 1879e38..967c7c3 100644
--- a/5.0/_modules/pythainlp/soundex/prayut_and_somchaip.html
+++ b/5.0/_modules/pythainlp/soundex/prayut_and_somchaip.html
@@ -226,5 +226,5 @@ 

Source code for pythainlp.soundex.prayut_and_somchaip

\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/soundex/sound.html b/5.0/_modules/pythainlp/soundex/sound.html
index 71d905b..13f4885 100644
--- a/5.0/_modules/pythainlp/soundex/sound.html
+++ b/5.0/_modules/pythainlp/soundex/sound.html
@@ -220,5 +220,5 @@

Source code for pythainlp.soundex.sound

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/soundex/udom83.html b/5.0/_modules/pythainlp/soundex/udom83.html
index 5a019df..22a2497 100644
--- a/5.0/_modules/pythainlp/soundex/udom83.html
+++ b/5.0/_modules/pythainlp/soundex/udom83.html
@@ -240,5 +240,5 @@ 

Source code for pythainlp.soundex.udom83

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/spell/core.html b/5.0/_modules/pythainlp/spell/core.html
index d9e7591..8e1921f 100644
--- a/5.0/_modules/pythainlp/spell/core.html
+++ b/5.0/_modules/pythainlp/spell/core.html
@@ -329,5 +329,5 @@ 

Source code for pythainlp.spell.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/spell/pn.html b/5.0/_modules/pythainlp/spell/pn.html
index edb6ff0..2cffb47 100644
--- a/5.0/_modules/pythainlp/spell/pn.html
+++ b/5.0/_modules/pythainlp/spell/pn.html
@@ -510,5 +510,5 @@ 

Source code for pythainlp.spell.pn

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/summarize/core.html b/5.0/_modules/pythainlp/summarize/core.html
index 7654ae0..13ebc6f 100644
--- a/5.0/_modules/pythainlp/summarize/core.html
+++ b/5.0/_modules/pythainlp/summarize/core.html
@@ -389,5 +389,5 @@ 

Source code for pythainlp.summarize.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/summarize/keybert.html b/5.0/_modules/pythainlp/summarize/keybert.html
index 1933c88..7f60ea1 100644
--- a/5.0/_modules/pythainlp/summarize/keybert.html
+++ b/5.0/_modules/pythainlp/summarize/keybert.html
@@ -366,5 +366,5 @@ 

Source code for pythainlp.summarize.keybert

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tag/chunk.html b/5.0/_modules/pythainlp/tag/chunk.html
index abf0520..67246f8 100644
--- a/5.0/_modules/pythainlp/tag/chunk.html
+++ b/5.0/_modules/pythainlp/tag/chunk.html
@@ -172,5 +172,5 @@ 

Source code for pythainlp.tag.chunk

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tag/locations.html b/5.0/_modules/pythainlp/tag/locations.html
index e2d7cfa..edc6c94 100644
--- a/5.0/_modules/pythainlp/tag/locations.html
+++ b/5.0/_modules/pythainlp/tag/locations.html
@@ -175,5 +175,5 @@ 

Source code for pythainlp.tag.locations

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tag/named_entity.html b/5.0/_modules/pythainlp/tag/named_entity.html
index ecac29b..bc2fa51 100644
--- a/5.0/_modules/pythainlp/tag/named_entity.html
+++ b/5.0/_modules/pythainlp/tag/named_entity.html
@@ -303,5 +303,5 @@ 

Source code for pythainlp.tag.named_entity

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tag/pos_tag.html b/5.0/_modules/pythainlp/tag/pos_tag.html
index 1fa5bbf..373ab68 100644
--- a/5.0/_modules/pythainlp/tag/pos_tag.html
+++ b/5.0/_modules/pythainlp/tag/pos_tag.html
@@ -389,5 +389,5 @@ 

Source code for pythainlp.tag.pos_tag

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tag/thainer.html b/5.0/_modules/pythainlp/tag/thainer.html
index f930e38..d3de92d 100644
--- a/5.0/_modules/pythainlp/tag/thainer.html
+++ b/5.0/_modules/pythainlp/tag/thainer.html
@@ -344,5 +344,5 @@ 

Source code for pythainlp.tag.thainer

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tag/tltk.html b/5.0/_modules/pythainlp/tag/tltk.html
index 47e193a..15072f9 100644
--- a/5.0/_modules/pythainlp/tag/tltk.html
+++ b/5.0/_modules/pythainlp/tag/tltk.html
@@ -234,5 +234,5 @@ 

Source code for pythainlp.tag.tltk

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/attacut.html b/5.0/_modules/pythainlp/tokenize/attacut.html
index 33d8802..96e39d0 100644
--- a/5.0/_modules/pythainlp/tokenize/attacut.html
+++ b/5.0/_modules/pythainlp/tokenize/attacut.html
@@ -182,5 +182,5 @@ 

Source code for pythainlp.tokenize.attacut

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/core.html b/5.0/_modules/pythainlp/tokenize/core.html
index dc820af..b97edbb 100644
--- a/5.0/_modules/pythainlp/tokenize/core.html
+++ b/5.0/_modules/pythainlp/tokenize/core.html
@@ -931,5 +931,5 @@ 

Source code for pythainlp.tokenize.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/crfcut.html b/5.0/_modules/pythainlp/tokenize/crfcut.html
index 73cf9e9..185ee31 100644
--- a/5.0/_modules/pythainlp/tokenize/crfcut.html
+++ b/5.0/_modules/pythainlp/tokenize/crfcut.html
@@ -357,5 +357,5 @@ 

Source code for pythainlp.tokenize.crfcut

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/deepcut.html b/5.0/_modules/pythainlp/tokenize/deepcut.html
index c4c96f6..7b364f4 100644
--- a/5.0/_modules/pythainlp/tokenize/deepcut.html
+++ b/5.0/_modules/pythainlp/tokenize/deepcut.html
@@ -173,5 +173,5 @@ 

Source code for pythainlp.tokenize.deepcut

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/etcc.html b/5.0/_modules/pythainlp/tokenize/etcc.html
index 1631668..65170d0 100644
--- a/5.0/_modules/pythainlp/tokenize/etcc.html
+++ b/5.0/_modules/pythainlp/tokenize/etcc.html
@@ -204,5 +204,5 @@ 

Source code for pythainlp.tokenize.etcc

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/han_solo.html b/5.0/_modules/pythainlp/tokenize/han_solo.html
index 25dbe1b..955d098 100644
--- a/5.0/_modules/pythainlp/tokenize/han_solo.html
+++ b/5.0/_modules/pythainlp/tokenize/han_solo.html
@@ -268,5 +268,5 @@ 

Source code for pythainlp.tokenize.han_solo

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/longest.html b/5.0/_modules/pythainlp/tokenize/longest.html
index cf4d002..bc6d053 100644
--- a/5.0/_modules/pythainlp/tokenize/longest.html
+++ b/5.0/_modules/pythainlp/tokenize/longest.html
@@ -297,5 +297,5 @@ 

Source code for pythainlp.tokenize.longest

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/multi_cut.html b/5.0/_modules/pythainlp/tokenize/multi_cut.html
index 4f34105..b96f11d 100644
--- a/5.0/_modules/pythainlp/tokenize/multi_cut.html
+++ b/5.0/_modules/pythainlp/tokenize/multi_cut.html
@@ -299,5 +299,5 @@ 

Source code for pythainlp.tokenize.multi_cut

 
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/nercut.html b/5.0/_modules/pythainlp/tokenize/nercut.html
index 47e2110..129660e 100644
--- a/5.0/_modules/pythainlp/tokenize/nercut.html
+++ b/5.0/_modules/pythainlp/tokenize/nercut.html
@@ -216,5 +216,5 @@ 

Source code for pythainlp.tokenize.nercut

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/newmm.html b/5.0/_modules/pythainlp/tokenize/newmm.html
index adab7ff..f9107ab 100644
--- a/5.0/_modules/pythainlp/tokenize/newmm.html
+++ b/5.0/_modules/pythainlp/tokenize/newmm.html
@@ -350,5 +350,5 @@ 

Source code for pythainlp.tokenize.newmm

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/nlpo3.html b/5.0/_modules/pythainlp/tokenize/nlpo3.html
index 5fe2860..44373c0 100644
--- a/5.0/_modules/pythainlp/tokenize/nlpo3.html
+++ b/5.0/_modules/pythainlp/tokenize/nlpo3.html
@@ -207,5 +207,5 @@ 

Source code for pythainlp.tokenize.nlpo3

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/oskut.html b/5.0/_modules/pythainlp/tokenize/oskut.html
index fd8f8ad..a69f73c 100644
--- a/5.0/_modules/pythainlp/tokenize/oskut.html
+++ b/5.0/_modules/pythainlp/tokenize/oskut.html
@@ -165,5 +165,5 @@ 

Source code for pythainlp.tokenize.oskut

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/pyicu.html b/5.0/_modules/pythainlp/tokenize/pyicu.html
index 7244fb8..eb3561f 100644
--- a/5.0/_modules/pythainlp/tokenize/pyicu.html
+++ b/5.0/_modules/pythainlp/tokenize/pyicu.html
@@ -175,5 +175,5 @@ 

Source code for pythainlp.tokenize.pyicu

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/sefr_cut.html b/5.0/_modules/pythainlp/tokenize/sefr_cut.html
index a3edda8..c6351ed 100644
--- a/5.0/_modules/pythainlp/tokenize/sefr_cut.html
+++ b/5.0/_modules/pythainlp/tokenize/sefr_cut.html
@@ -164,5 +164,5 @@ 

Source code for pythainlp.tokenize.sefr_cut

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/tcc.html b/5.0/_modules/pythainlp/tokenize/tcc.html
index 2248650..2c54006 100644
--- a/5.0/_modules/pythainlp/tokenize/tcc.html
+++ b/5.0/_modules/pythainlp/tokenize/tcc.html
@@ -253,5 +253,5 @@ 

Source code for pythainlp.tokenize.tcc

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/tcc_p.html b/5.0/_modules/pythainlp/tokenize/tcc_p.html
index 5a2bcb4..7293840 100644
--- a/5.0/_modules/pythainlp/tokenize/tcc_p.html
+++ b/5.0/_modules/pythainlp/tokenize/tcc_p.html
@@ -253,5 +253,5 @@ 

Source code for pythainlp.tokenize.tcc_p

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tokenize/thaisumcut.html b/5.0/_modules/pythainlp/tokenize/thaisumcut.html
index 008c5a4..ec07ea7 100644
--- a/5.0/_modules/pythainlp/tokenize/thaisumcut.html
+++ b/5.0/_modules/pythainlp/tokenize/thaisumcut.html
@@ -516,5 +516,5 @@ 

Source code for pythainlp.tokenize.thaisumcut

- + \ No newline at end of file diff --git a/5.0/_modules/pythainlp/tools/misspell.html b/5.0/_modules/pythainlp/tools/misspell.html index 90e9b63..2494b08 100644 --- a/5.0/_modules/pythainlp/tools/misspell.html +++ b/5.0/_modules/pythainlp/tools/misspell.html @@ -281,5 +281,5 @@

Source code for pythainlp.tools.misspell

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/tools/path.html b/5.0/_modules/pythainlp/tools/path.html
index 7e18114..99fd7fe 100644
--- a/5.0/_modules/pythainlp/tools/path.html
+++ b/5.0/_modules/pythainlp/tools/path.html
@@ -215,5 +215,5 @@ 

Source code for pythainlp.tools.path

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/translate/core.html b/5.0/_modules/pythainlp/translate/core.html
index 71274a4..7dcf27f 100644
--- a/5.0/_modules/pythainlp/translate/core.html
+++ b/5.0/_modules/pythainlp/translate/core.html
@@ -229,5 +229,5 @@ 

Source code for pythainlp.translate.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/translate/en_th.html b/5.0/_modules/pythainlp/translate/en_th.html
index 46a36e4..e18da00 100644
--- a/5.0/_modules/pythainlp/translate/en_th.html
+++ b/5.0/_modules/pythainlp/translate/en_th.html
@@ -299,5 +299,5 @@ 

Source code for pythainlp.translate.en_th

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/translate/th_fr.html b/5.0/_modules/pythainlp/translate/th_fr.html
index 9687147..98c2430 100644
--- a/5.0/_modules/pythainlp/translate/th_fr.html
+++ b/5.0/_modules/pythainlp/translate/th_fr.html
@@ -207,5 +207,5 @@ 

Source code for pythainlp.translate.th_fr

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/translate/zh_th.html b/5.0/_modules/pythainlp/translate/zh_th.html
index dc3c21e..645a942 100644
--- a/5.0/_modules/pythainlp/translate/zh_th.html
+++ b/5.0/_modules/pythainlp/translate/zh_th.html
@@ -252,5 +252,5 @@ 

Source code for pythainlp.translate.zh_th

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/transliterate/core.html b/5.0/_modules/pythainlp/transliterate/core.html
index f09a8e1..ae8b6c9 100644
--- a/5.0/_modules/pythainlp/transliterate/core.html
+++ b/5.0/_modules/pythainlp/transliterate/core.html
@@ -337,5 +337,5 @@ 

Source code for pythainlp.transliterate.core

 
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/transliterate/spoonerism.html b/5.0/_modules/pythainlp/transliterate/spoonerism.html
index d787cf7..9d94423 100644
--- a/5.0/_modules/pythainlp/transliterate/spoonerism.html
+++ b/5.0/_modules/pythainlp/transliterate/spoonerism.html
@@ -216,5 +216,5 @@ 

Source code for pythainlp.transliterate.spoonerism

\ No newline at end of file diff --git a/5.0/_modules/pythainlp/transliterate/wunsen.html b/5.0/_modules/pythainlp/transliterate/wunsen.html index cb19a50..9d2eaa9 100644 --- a/5.0/_modules/pythainlp/transliterate/wunsen.html +++ b/5.0/_modules/pythainlp/transliterate/wunsen.html @@ -284,5 +284,5 @@

Source code for pythainlp.transliterate.wunsen

- + \ No newline at end of file diff --git a/5.0/_modules/pythainlp/ulmfit/core.html b/5.0/_modules/pythainlp/ulmfit/core.html index d9a14b5..b741e50 100644 --- a/5.0/_modules/pythainlp/ulmfit/core.html +++ b/5.0/_modules/pythainlp/ulmfit/core.html @@ -396,5 +396,5 @@

Source code for pythainlp.ulmfit.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/ulmfit/preprocess.html b/5.0/_modules/pythainlp/ulmfit/preprocess.html
index 6df1597..6e94748 100644
--- a/5.0/_modules/pythainlp/ulmfit/preprocess.html
+++ b/5.0/_modules/pythainlp/ulmfit/preprocess.html
@@ -428,5 +428,5 @@ 

Source code for pythainlp.ulmfit.preprocess

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/ulmfit/tokenizer.html b/5.0/_modules/pythainlp/ulmfit/tokenizer.html
index 5479783..c6d66c3 100644
--- a/5.0/_modules/pythainlp/ulmfit/tokenizer.html
+++ b/5.0/_modules/pythainlp/ulmfit/tokenizer.html
@@ -207,5 +207,5 @@ 

Source code for pythainlp.ulmfit.tokenizer

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/abbreviation.html b/5.0/_modules/pythainlp/util/abbreviation.html
index 7e7417c..34daaa7 100644
--- a/5.0/_modules/pythainlp/util/abbreviation.html
+++ b/5.0/_modules/pythainlp/util/abbreviation.html
@@ -183,5 +183,5 @@ 

Source code for pythainlp.util.abbreviation

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/collate.html b/5.0/_modules/pythainlp/util/collate.html
index c355efb..7f91ef5 100644
--- a/5.0/_modules/pythainlp/util/collate.html
+++ b/5.0/_modules/pythainlp/util/collate.html
@@ -189,5 +189,5 @@ 

Source code for pythainlp.util.collate

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/date.html b/5.0/_modules/pythainlp/util/date.html
index ae12cb1..c0b5679 100644
--- a/5.0/_modules/pythainlp/util/date.html
+++ b/5.0/_modules/pythainlp/util/date.html
@@ -535,5 +535,5 @@ 

Source code for pythainlp.util.date

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/digitconv.html b/5.0/_modules/pythainlp/util/digitconv.html
index 5467ae9..046d473 100644
--- a/5.0/_modules/pythainlp/util/digitconv.html
+++ b/5.0/_modules/pythainlp/util/digitconv.html
@@ -337,5 +337,5 @@ 

Source code for pythainlp.util.digitconv

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/emojiconv.html b/5.0/_modules/pythainlp/util/emojiconv.html
index 1e4857f..1a4ac0b 100644
--- a/5.0/_modules/pythainlp/util/emojiconv.html
+++ b/5.0/_modules/pythainlp/util/emojiconv.html
@@ -2001,5 +2001,5 @@ 

Source code for pythainlp.util.emojiconv

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/encoding.html b/5.0/_modules/pythainlp/util/encoding.html
index 9250eb7..a9e198d 100644
--- a/5.0/_modules/pythainlp/util/encoding.html
+++ b/5.0/_modules/pythainlp/util/encoding.html
@@ -177,5 +177,5 @@ 

Source code for pythainlp.util.encoding

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/keyboard.html b/5.0/_modules/pythainlp/util/keyboard.html
index dcfde42..d075da4 100644
--- a/5.0/_modules/pythainlp/util/keyboard.html
+++ b/5.0/_modules/pythainlp/util/keyboard.html
@@ -364,5 +364,5 @@ 

Source code for pythainlp.util.keyboard

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/keywords.html b/5.0/_modules/pythainlp/util/keywords.html
index 41965be..ce8a2fd 100644
--- a/5.0/_modules/pythainlp/util/keywords.html
+++ b/5.0/_modules/pythainlp/util/keywords.html
@@ -241,5 +241,5 @@ 

Source code for pythainlp.util.keywords

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/morse.html b/5.0/_modules/pythainlp/util/morse.html
index a7e5509..8ee32bb 100644
--- a/5.0/_modules/pythainlp/util/morse.html
+++ b/5.0/_modules/pythainlp/util/morse.html
@@ -337,5 +337,5 @@ 

Source code for pythainlp.util.morse

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/normalize.html b/5.0/_modules/pythainlp/util/normalize.html
index afbd4fc..094502c 100644
--- a/5.0/_modules/pythainlp/util/normalize.html
+++ b/5.0/_modules/pythainlp/util/normalize.html
@@ -429,5 +429,5 @@ 

Source code for pythainlp.util.normalize

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/numtoword.html b/5.0/_modules/pythainlp/util/numtoword.html
index 3a8aae4..a990d43 100644
--- a/5.0/_modules/pythainlp/util/numtoword.html
+++ b/5.0/_modules/pythainlp/util/numtoword.html
@@ -259,5 +259,5 @@ 

Source code for pythainlp.util.numtoword

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/phoneme.html b/5.0/_modules/pythainlp/util/phoneme.html
index 4a20408..c632fc0 100644
--- a/5.0/_modules/pythainlp/util/phoneme.html
+++ b/5.0/_modules/pythainlp/util/phoneme.html
@@ -395,5 +395,5 @@ 

Source code for pythainlp.util.phoneme

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/pronounce.html b/5.0/_modules/pythainlp/util/pronounce.html
index 94b3790..864b35e 100644
--- a/5.0/_modules/pythainlp/util/pronounce.html
+++ b/5.0/_modules/pythainlp/util/pronounce.html
@@ -177,5 +177,5 @@ 

Source code for pythainlp.util.pronounce

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/spell_words.html b/5.0/_modules/pythainlp/util/spell_words.html
index ba67721..b4d7288 100644
--- a/5.0/_modules/pythainlp/util/spell_words.html
+++ b/5.0/_modules/pythainlp/util/spell_words.html
@@ -267,5 +267,5 @@ 

Source code for pythainlp.util.spell_words

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/strftime.html b/5.0/_modules/pythainlp/util/strftime.html
index 31c4cb0..a315c79 100644
--- a/5.0/_modules/pythainlp/util/strftime.html
+++ b/5.0/_modules/pythainlp/util/strftime.html
@@ -476,5 +476,5 @@ 

Source code for pythainlp.util.strftime

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/syllable.html b/5.0/_modules/pythainlp/util/syllable.html
index 33563fe..a244b6c 100644
--- a/5.0/_modules/pythainlp/util/syllable.html
+++ b/5.0/_modules/pythainlp/util/syllable.html
@@ -455,5 +455,5 @@ 

Source code for pythainlp.util.syllable

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/thai.html b/5.0/_modules/pythainlp/util/thai.html
index 549ae7d..866acf5 100644
--- a/5.0/_modules/pythainlp/util/thai.html
+++ b/5.0/_modules/pythainlp/util/thai.html
@@ -406,5 +406,5 @@ 

Source code for pythainlp.util.thai

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/time.html b/5.0/_modules/pythainlp/util/time.html
index 140302c..df13bae 100644
--- a/5.0/_modules/pythainlp/util/time.html
+++ b/5.0/_modules/pythainlp/util/time.html
@@ -465,5 +465,5 @@ 

Source code for pythainlp.util.time

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/trie.html b/5.0/_modules/pythainlp/util/trie.html
index 10ea8dd..9545075 100644
--- a/5.0/_modules/pythainlp/util/trie.html
+++ b/5.0/_modules/pythainlp/util/trie.html
@@ -267,5 +267,5 @@ 

Source code for pythainlp.util.trie

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/util/wordtonum.html b/5.0/_modules/pythainlp/util/wordtonum.html
index ec42882..f4a0b75 100644
--- a/5.0/_modules/pythainlp/util/wordtonum.html
+++ b/5.0/_modules/pythainlp/util/wordtonum.html
@@ -354,5 +354,5 @@ 

Source code for pythainlp.util.wordtonum

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/wangchanberta/core.html b/5.0/_modules/pythainlp/wangchanberta/core.html
index 4afd5d1..727bbb8 100644
--- a/5.0/_modules/pythainlp/wangchanberta/core.html
+++ b/5.0/_modules/pythainlp/wangchanberta/core.html
@@ -375,5 +375,5 @@ 

Source code for pythainlp.wangchanberta.core

 
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/word_vector/core.html b/5.0/_modules/pythainlp/word_vector/core.html
index 7b089c2..8c5085a 100644
--- a/5.0/_modules/pythainlp/word_vector/core.html
+++ b/5.0/_modules/pythainlp/word_vector/core.html
@@ -458,5 +458,5 @@ 

Source code for pythainlp.word_vector.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/_modules/pythainlp/wsd/core.html b/5.0/_modules/pythainlp/wsd/core.html
index 011a8dd..1c7c150 100644
--- a/5.0/_modules/pythainlp/wsd/core.html
+++ b/5.0/_modules/pythainlp/wsd/core.html
@@ -266,5 +266,5 @@ 

Source code for pythainlp.wsd.core

       });
    
 
-
+
 
\ No newline at end of file
diff --git a/5.0/api/ancient.html b/5.0/api/ancient.html
index ce81a8c..39fb2f6 100644
--- a/5.0/api/ancient.html
+++ b/5.0/api/ancient.html
@@ -188,5 +188,5 @@ 

Modules \ No newline at end of file diff --git a/5.0/api/augment.html b/5.0/api/augment.html index 285eefa..ba1228c 100644 --- a/5.0/api/augment.html +++ b/5.0/api/augment.html @@ -678,5 +678,5 @@

Additional Functions - + \ No newline at end of file diff --git a/5.0/api/benchmarks.html b/5.0/api/benchmarks.html index 4d41bb7..bbdb6cd 100644 --- a/5.0/api/benchmarks.html +++ b/5.0/api/benchmarks.html @@ -262,5 +262,5 @@

Usage }); - + \ No newline at end of file diff --git a/5.0/api/chat.html b/5.0/api/chat.html index 5231674..3272be7 100644 --- a/5.0/api/chat.html +++ b/5.0/api/chat.html @@ -223,5 +223,5 @@

pythainlp.chat \ No newline at end of file diff --git a/5.0/api/classify.html b/5.0/api/classify.html index b09644a..6dd1a4b 100644 --- a/5.0/api/classify.html +++ b/5.0/api/classify.html @@ -216,5 +216,5 @@

pythainlp.classify \ No newline at end of file diff --git a/5.0/api/coref.html b/5.0/api/coref.html index b113962..7628ffb 100644 --- a/5.0/api/coref.html +++ b/5.0/api/coref.html @@ -223,5 +223,5 @@

Usage }); - + \ No newline at end of file diff --git a/5.0/api/corpus.html b/5.0/api/corpus.html index c1b425f..18bf378 100644 --- a/5.0/api/corpus.html +++ b/5.0/api/corpus.html @@ -1693,5 +1693,5 @@

Synset - + \ No newline at end of file diff --git a/5.0/api/el.html b/5.0/api/el.html index f9bfd93..97aa873 100644 --- a/5.0/api/el.html +++ b/5.0/api/el.html @@ -225,5 +225,5 @@

Example \ No newline at end of file diff --git a/5.0/api/generate.html b/5.0/api/generate.html index d58b8e4..08152cd 100644 --- a/5.0/api/generate.html +++ b/5.0/api/generate.html @@ -545,5 +545,5 @@

Example \ No newline at end of file diff --git a/5.0/api/khavee.html b/5.0/api/khavee.html index 37e4fb6..d31cebd 100644 --- a/5.0/api/khavee.html +++ b/5.0/api/khavee.html @@ -385,5 +385,5 @@

Example \ No newline at end of file diff --git a/5.0/api/morpheme.html b/5.0/api/morpheme.html index 984b0a1..1ad7c43 100644 --- a/5.0/api/morpheme.html +++ b/5.0/api/morpheme.html @@ -227,5 +227,5 @@

pythainlp.morpheme \ No newline at end of file diff --git a/5.0/api/parse.html b/5.0/api/parse.html index 8dda4b8..65e38ef 100644 --- a/5.0/api/parse.html +++ b/5.0/api/parse.html @@ -260,5 +260,5 @@

Example \ No newline at end of file diff --git a/5.0/api/phayathaibert.html b/5.0/api/phayathaibert.html index 5eb9495..882fb4e 100644 --- a/5.0/api/phayathaibert.html +++ b/5.0/api/phayathaibert.html @@ -474,5 +474,5 @@

Modules \ No newline at end of file diff --git a/5.0/api/soundex.html b/5.0/api/soundex.html index a7a9f94..62b5f33 100644 --- a/5.0/api/soundex.html +++ b/5.0/api/soundex.html @@ -540,5 +540,5 @@

References \ No newline at end of file diff --git a/5.0/api/spell.html b/5.0/api/spell.html index bdb03b6..ffbd61f 100644 --- a/5.0/api/spell.html +++ b/5.0/api/spell.html @@ -655,5 +655,5 @@

References \ No newline at end of file diff --git a/5.0/api/summarize.html b/5.0/api/summarize.html index 029dfa3..8fda89b 100644 --- a/5.0/api/summarize.html +++ b/5.0/api/summarize.html @@ -504,5 +504,5 @@

Keyword Extraction Engines \ No newline at end of file diff --git a/5.0/api/tag.html b/5.0/api/tag.html index 0e6ff18..2cb20e2 100644 --- a/5.0/api/tag.html +++ b/5.0/api/tag.html @@ -1133,5 +1133,5 @@

References \ No newline at end of file diff --git a/5.0/api/tokenize.html b/5.0/api/tokenize.html index d96e781..4b5fb65 100644 --- a/5.0/api/tokenize.html +++ b/5.0/api/tokenize.html @@ -1417,5 +1417,5 @@

Subword level \ No newline at end of file diff --git a/5.0/api/tools.html b/5.0/api/tools.html index 637c1d8..63ab425 100644 --- a/5.0/api/tools.html +++ b/5.0/api/tools.html @@ -261,5 +261,5 @@

Modules \ No newline at end of file diff --git a/5.0/api/translate.html b/5.0/api/translate.html index 316fe0d..a61014f 100644 --- a/5.0/api/translate.html +++ b/5.0/api/translate.html @@ -491,5 +491,5 @@

Modules \ No newline at end of file diff --git a/5.0/api/transliterate.html b/5.0/api/transliterate.html index dc53f90..d652e9e 100644 --- a/5.0/api/transliterate.html +++ b/5.0/api/transliterate.html @@ -533,5 +533,5 @@

References \ No newline at end of file diff --git a/5.0/api/ulmfit.html b/5.0/api/ulmfit.html index d11de53..4c573ae 100644 --- a/5.0/api/ulmfit.html +++ b/5.0/api/ulmfit.html @@ -594,5 +594,5 @@

Modules \ No newline at end of file diff --git a/5.0/api/util.html b/5.0/api/util.html index d966c1f..bc03785 100644 --- a/5.0/api/util.html +++ b/5.0/api/util.html @@ -2059,5 +2059,5 @@

References \ No newline at end of file diff --git a/5.0/api/wangchanberta.html b/5.0/api/wangchanberta.html index b6a363f..d367b73 100644 --- a/5.0/api/wangchanberta.html +++ b/5.0/api/wangchanberta.html @@ -299,5 +299,5 @@

References \ No newline at end of file diff --git a/5.0/api/word_vector.html b/5.0/api/word_vector.html index d92ab14..cbd9370 100644 --- a/5.0/api/word_vector.html +++ b/5.0/api/word_vector.html @@ -509,5 +509,5 @@

References \ No newline at end of file diff --git a/5.0/api/wsd.html b/5.0/api/wsd.html index b14368b..28458d2 100644 --- a/5.0/api/wsd.html +++ b/5.0/api/wsd.html @@ -205,5 +205,5 @@

Modules \ No newline at end of file diff --git a/5.0/genindex.html b/5.0/genindex.html index 3d47baa..d5b3378 100644 --- a/5.0/genindex.html +++ b/5.0/genindex.html @@ -1036,5 +1036,5 @@

Z

}); - + \ No newline at end of file diff --git a/5.0/index.html b/5.0/index.html index c493111..3ea2e4e 100644 --- a/5.0/index.html +++ b/5.0/index.html @@ -209,5 +209,5 @@

Citations \ No newline at end of file diff --git a/5.0/notes/FAQ.html b/5.0/notes/FAQ.html index b8db35f..5334883 100644 --- a/5.0/notes/FAQ.html +++ b/5.0/notes/FAQ.html @@ -148,5 +148,5 @@

FAQ - + \ No newline at end of file diff --git a/5.0/notes/command_line.html b/5.0/notes/command_line.html index 5c56bd0..dd9a7df 100644 --- a/5.0/notes/command_line.html +++ b/5.0/notes/command_line.html @@ -242,5 +242,5 @@

Command Line \ No newline at end of file diff --git a/5.0/notes/getting_started.html b/5.0/notes/getting_started.html index 7347f02..73a6d91 100644 --- a/5.0/notes/getting_started.html +++ b/5.0/notes/getting_started.html @@ -167,5 +167,5 @@

Tutorial Notebooks \ No newline at end of file diff --git a/5.0/notes/installation.html b/5.0/notes/installation.html index 958786a..9b6c262 100644 --- a/5.0/notes/installation.html +++ b/5.0/notes/installation.html @@ -247,5 +247,5 @@

FAQ - + \ No newline at end of file diff --git a/5.0/notes/license.html b/5.0/notes/license.html index b3f5a4d..1b5607a 100644 --- a/5.0/notes/license.html +++ b/5.0/notes/license.html @@ -168,5 +168,5 @@

License \ No newline at end of file diff --git a/5.0/py-modindex.html b/5.0/py-modindex.html index 2eecafe..8644e44 100644 --- a/5.0/py-modindex.html +++ b/5.0/py-modindex.html @@ -252,5 +252,5 @@

Python Module Index

}); - + \ No newline at end of file diff --git a/5.0/search.html b/5.0/search.html index 9fc0a68..2736f85 100644 --- a/5.0/search.html +++ b/5.0/search.html @@ -157,5 +157,5 @@ - + \ No newline at end of file diff --git a/5.1/.buildinfo b/5.1/.buildinfo new file mode 100644 index 0000000..33f325f --- /dev/null +++ b/5.1/.buildinfo @@ -0,0 +1,4 @@ +# Sphinx build info version 1 +# This file records the configuration used when building these files. When it is not found, a full rebuild will be done. +config: f435f8948ee2bf8cfc09961b72395a36 +tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/5.1/_images/evaluation.png b/5.1/_images/evaluation.png new file mode 100644 index 0000000..3ac1cd1 Binary files /dev/null and b/5.1/_images/evaluation.png differ diff --git a/5.1/_images/logo.png b/5.1/_images/logo.png new file mode 100644 index 0000000..c5e5b00 Binary files /dev/null and b/5.1/_images/logo.png differ diff --git a/5.1/_modules/index.html b/5.1/_modules/index.html new file mode 100644 index 0000000..ff25279 --- /dev/null +++ b/5.1/_modules/index.html @@ -0,0 +1,237 @@ + + + + + + + + Overview: module code — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ +

All modules for which code is available

+ + +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/ancient/aksonhan.html b/5.1/_modules/pythainlp/ancient/aksonhan.html new file mode 100644 index 0000000..0bebc40 --- /dev/null +++ b/5.1/_modules/pythainlp/ancient/aksonhan.html @@ -0,0 +1,207 @@ + + + + + + + + pythainlp.ancient.aksonhan — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.ancient.aksonhan

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+from pythainlp import thai_consonants, thai_tonemarks
+from pythainlp.corpus import thai_orst_words
+from pythainlp.tokenize import Tokenizer
+from pythainlp.util import Trie
+
+_dict_aksonhan = {}
+for i in list(thai_consonants):
+    if i == "ร":
+        continue
+    for j in list(thai_tonemarks):
+        _dict_aksonhan[i + j + i] = "ั" + j + i
+        _dict_aksonhan[i + i + j + i] = i + "ั" + j + i
+    _dict_aksonhan[i + i] = "ั" + i
+_set_aksonhan = set(_dict_aksonhan.keys())
+_trie = Trie(list(_dict_aksonhan.keys()) + list(thai_consonants))
+_tokenizer = Tokenizer(custom_dict=_trie, engine="mm")
+_dict_thai = set(thai_orst_words())  # Thai words from the ORST word list
+
+
+
+[docs] +def aksonhan_to_current(word: str) -> str: + """ + Convert AksonHan words to current Thai words + + AksonHan (อักษรหัน) writes down two consonants for the \ + spelling of the /a/ vowels. (สระ อะ). + + Today, รร is an aksonHan word that is still used in Thai. + + :param str word: Thai word + :return: Thai AksonHan to be converted to current Thai word + :rtype: str + + :Example: + :: + + from pythainlp.ancient import aksonhan_to_current + + print(aksonhan_to_current("จกก")) + # output: จัก + + print(aksonhan_to_current("บงงคบบ")) + # output: บังคับ + + print(aksonhan_to_current("สรรเพชญ")) # รร is still used. + # output: สรรเพชญ + """ + if len(word) < 3: + return word + elif word in _set_aksonhan: + return _dict_aksonhan[word] + elif word in _dict_thai: # word in Thai words + return word + + _seg = _tokenizer.word_tokenize(word) + _w = [] + for i in _seg: + if i in _set_aksonhan: + _w.append(_dict_aksonhan[i]) + else: + _w.append(i) + return "".join(_w)
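A quick usage sketch of the function above, assuming PyThaiNLP and its corpus data are installed; the words are the ones from the docstring, and the list comprehension is only illustrative:

    from pythainlp.ancient import aksonhan_to_current

    old_words = ["จกก", "บงงคบบ", "สรรเพชญ"]  # examples taken from the docstring above
    print([aksonhan_to_current(w) for w in old_words])
    # expected, per the docstring: ['จัก', 'บังคับ', 'สรรเพชญ']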
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/augment/lm/fasttext.html b/5.1/_modules/pythainlp/augment/lm/fasttext.html new file mode 100644 index 0000000..d388c4d --- /dev/null +++ b/5.1/_modules/pythainlp/augment/lm/fasttext.html @@ -0,0 +1,236 @@ + + + + + + + + pythainlp.augment.lm.fasttext — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for pythainlp.augment.lm.fasttext

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+import itertools
+from typing import List, Tuple
+
+from gensim.models.fasttext import FastText as FastText_gensim
+from gensim.models.keyedvectors import KeyedVectors
+
+from pythainlp.tokenize import word_tokenize
+
+
+
+[docs] +class FastTextAug: + """ + Text Augment from fastText + + :param str model_path: path of model file + """ + +
+[docs] + def __init__(self, model_path: str): + """ + :param str model_path: path of model file + """ + if model_path.endswith(".bin"): + self.model = FastText_gensim.load_facebook_vectors(model_path) + elif model_path.endswith(".vec"): + self.model = KeyedVectors.load_word2vec_format(model_path) + else: + self.model = FastText_gensim.load(model_path) + self.dict_wv = list(self.model.key_to_index.keys())
+ + +
+[docs] + def tokenize(self, text: str) -> List[str]: + """ + Thai text tokenization for fastText + + :param str text: Thai text + + :return: list of words + :rtype: List[str] + """ + return word_tokenize(text, engine="icu")
+ + +
+[docs] + def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]: + """ + :param str sent: text of sentence + :param float p: probability + :rtype: List[List[str]] + """ + list_sent_new = [] + for i in sent: + if i in self.dict_wv: + w = [j for j, v in self.model.most_similar(i) if v >= p] + if w == []: + list_sent_new.append([i]) + else: + list_sent_new.append(w) + else: + list_sent_new.append([i]) + return list_sent_new
+ + +
+[docs] + def augment( + self, sentence: str, n_sent: int = 1, p: float = 0.7 + ) -> List[Tuple[str]]: + """ + Text Augment from fastText + + You may want to download the Thai model + from https://fasttext.cc/docs/en/crawl-vectors.html. + + :param str sentence: Thai sentence + :param int n_sent: number of sentences + :param float p: probability of word + + :return: list of synonyms + :rtype: List[Tuple[str]] + """ + self.sentence = self.tokenize(sentence) + self.list_synonym = self.modify_sent(self.sentence, p=p) + new_sentences = [] + for x in list(itertools.product(*self.list_synonym))[0:n_sent]: + new_sentences.append(x) + return new_sentences
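A minimal sketch of FastTextAug, assuming a Thai fastText vector file has already been downloaded from https://fasttext.cc/docs/en/crawl-vectors.html; the local filename below is only an example:

    from pythainlp.augment.lm.fasttext import FastTextAug

    # ".vec" files load through KeyedVectors, ".bin" through the fastText loader
    aug = FastTextAug("cc.th.300.vec")
    print(aug.augment("ผมเรียน", n_sent=2, p=0.7))
    # returns up to n_sent tuples of ICU-tokenized words, each replaced by a
    # most_similar() neighbour with similarity >= p; output depends on the vectors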
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/augment/lm/wangchanberta.html b/5.1/_modules/pythainlp/augment/lm/wangchanberta.html new file mode 100644 index 0000000..26492ad --- /dev/null +++ b/5.1/_modules/pythainlp/augment/lm/wangchanberta.html @@ -0,0 +1,231 @@ + + + + + + + + pythainlp.augment.lm.wangchanberta — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for pythainlp.augment.lm.wangchanberta

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import List
+
+from transformers import (
+    CamembertTokenizer,
+    pipeline,
+)
+
+model_name = "airesearch/wangchanberta-base-att-spm-uncased"
+
+
+
+[docs] +class Thai2transformersAug: +
+[docs] + def __init__(self): + self.model_name = "airesearch/wangchanberta-base-att-spm-uncased" + self.target_tokenizer = CamembertTokenizer + self.tokenizer = CamembertTokenizer.from_pretrained( + self.model_name, revision="main" + ) + self.tokenizer.additional_special_tokens = [ + "<s>NOTUSED", + "</s>NOTUSED", + "<_>", + ] + self.fill_mask = pipeline( + task="fill-mask", + tokenizer=self.tokenizer, + model=f"{self.model_name}", + revision="main", + ) + self.MASK_TOKEN = self.tokenizer.mask_token
+ + +
+[docs] + def generate(self, sentence: str, num_replace_tokens: int = 3): + self.sent2 = [] + self.input_text = sentence + sent = [ + i for i in self.tokenizer.tokenize(self.input_text) if i != "▁" + ] + if len(sent) < num_replace_tokens: + num_replace_tokens = len(sent) + masked_text = self.input_text + for i in range(num_replace_tokens): + masked_text = masked_text + self.MASK_TOKEN + self.sent2 += [ + str(j["sequence"]).replace("<s> ", "").replace("</s>", "") + for j in self.fill_mask(masked_text) + if j["sequence"] not in self.sent2 + ] + masked_text = self.input_text + return self.sent2
+ + +
+[docs] + def augment(self, sentence: str, num_replace_tokens: int = 3) -> List[str]: + """ + Text augmentation from WangchanBERTa + + :param str sentence: Thai sentence + :param int num_replace_tokens: number replace tokens + + :return: list of text augment + :rtype: List[str] + + :Example: + :: + + from pythainlp.augment.lm import Thai2transformersAug + + aug = Thai2transformersAug() + + aug.augment("ช้างมีทั้งหมด 50 ตัว บน") + # output: ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้', + 'ช้างมีทั้งหมด 50 ตัว บนสุด', + 'ช้างมีทั้งหมด 50 ตัว บนบก', + 'ช้างมีทั้งหมด 50 ตัว บนนั้น', + 'ช้างมีทั้งหมด 50 ตัว บนหัว'] + """ + self.sent2 = [] + self.sent2 = self.generate(sentence, num_replace_tokens) + return self.sent2
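A usage sketch complementing the docstring above: the first instantiation downloads the airesearch/wangchanberta-base-att-spm-uncased checkpoint through the transformers fill-mask pipeline, so it needs network access and disk space, and exact outputs will vary:

    from pythainlp.augment.lm.wangchanberta import Thai2transformersAug

    aug = Thai2transformersAug()  # downloads the WangchanBERTa checkpoint on first use
    print(aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_replace_tokens=3))
    # a list of unique completions proposed by the fill-mask pipeline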
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/augment/word2vec/bpemb_wv.html b/5.1/_modules/pythainlp/augment/word2vec/bpemb_wv.html new file mode 100644 index 0000000..015a80a --- /dev/null +++ b/5.1/_modules/pythainlp/augment/word2vec/bpemb_wv.html @@ -0,0 +1,222 @@ + + + + + + + + pythainlp.augment.word2vec.bpemb_wv — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for pythainlp.augment.word2vec.bpemb_wv

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from typing import List, Tuple
+
+from pythainlp.augment.word2vec.core import Word2VecAug
+
+
+
+[docs] +class BPEmbAug: + """ + Thai Text Augment using word2vec from BPEmb + + BPEmb: + `github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_ + """ + +
+[docs] + def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300): + from bpemb import BPEmb + + self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs) + self.model = self.bpemb_temp.emb + self.load_w2v()
+ + +
+[docs] + def tokenizer(self, text: str) -> List[str]: + """ + :param str text: Thai text + :rtype: List[str] + """ + return self.bpemb_temp.encode(text)
+ + +
+[docs] + def load_w2v(self): + """ + Load BPEmb model + """ + self.aug = Word2VecAug( + self.model, tokenize=self.tokenizer, type="model" + )
+ + +
+[docs] + def augment( + self, sentence: str, n_sent: int = 1, p: float = 0.7 + ) -> List[Tuple[str]]: + """ + Text Augment using word2vec from BPEmb + + :param str sentence: Thai sentence + :param int n_sent: number of sentence + :param float p: probability of word + + :return: list of synonyms + :rtype: List[str] + :Example: + :: + + from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug + + aug = BPEmbAug() + aug.augment("ผมเรียน", n_sent=2, p=0.5) + # output: ['ผมสอน', 'ผมเข้าเรียน'] + """ + self.sentence = sentence.replace(" ", "▁") + self.temp = self.aug.augment(self.sentence, n_sent, p=p) + self.temp_new = [] + for i in self.temp: + self.t = "" + for j in i: + self.t += j.replace("▁", "") + self.temp_new.append(self.t) + return self.temp_new
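A sketch showing the constructor parameters that the docstring example leaves at their defaults; the bpemb package fetches the requested Thai subword embeddings on first use, so vs (vocabulary size) and dim affect both quality and download size:

    from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug

    aug = BPEmbAug(lang="th", vs=100000, dim=300)  # the defaults shown in __init__
    print(aug.augment("ผมเรียน", n_sent=2, p=0.5))
    # per the docstring: ['ผมสอน', 'ผมเข้าเรียน'] (may differ with other vs/dim)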
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/augment/word2vec/core.html b/5.1/_modules/pythainlp/augment/word2vec/core.html new file mode 100644 index 0000000..db7afaf --- /dev/null +++ b/5.1/_modules/pythainlp/augment/word2vec/core.html @@ -0,0 +1,215 @@ + + + + + + + + pythainlp.augment.word2vec.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for pythainlp.augment.word2vec.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+import itertools
+from typing import List, Tuple
+
+
+
+[docs] +class Word2VecAug: +
+[docs] + def __init__( + self, model: str, tokenize: object, type: str = "file" + ) -> None: + """ + :param str model: path of model + :param object tokenize: tokenize function + :param str type: model type (file, binary) + """ + import gensim.models.keyedvectors as word2vec + + self.tokenizer = tokenize + if type == "file": + self.model = word2vec.KeyedVectors.load_word2vec_format(model) + elif type == "binary": + self.model = word2vec.KeyedVectors.load_word2vec_format( + model, binary=True, unicode_errors="ignore" + ) + else: + self.model = model + self.dict_wv = list(self.model.key_to_index.keys())
+ + +
+[docs] + def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]: + """ + :param str sent: text of sentence + :param float p: probability + :rtype: List[List[str]] + """ + list_sent_new = [] + for i in sent: + if i in self.dict_wv: + w = [j for j, v in self.model.most_similar(i) if v >= p] + if w == []: + list_sent_new.append([i]) + else: + list_sent_new.append(w) + else: + list_sent_new.append([i]) + return list_sent_new
+ + +
+[docs] + def augment( + self, sentence: str, n_sent: int = 1, p: float = 0.7 + ) -> List[Tuple[str]]: + """ + :param str sentence: text of sentence + :param int n_sent: maximum number of synonymous sentences + :param int p: probability + + :return: list of synonyms + :rtype: List[Tuple[str]] + """ + self.sentence = self.tokenizer(sentence) + self.list_synonym = self.modify_sent(self.sentence, p=p) + new_sentences = [] + for x in list(itertools.product(*self.list_synonym))[0:n_sent]: + new_sentences.append(x) + return new_sentences
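Word2VecAug has no example of its own in this file; a minimal sketch of driving it directly with your own vectors and tokenizer ("my_thai_vectors.bin" is a hypothetical local word2vec file in binary format):

    from pythainlp.augment.word2vec.core import Word2VecAug
    from pythainlp.tokenize import word_tokenize

    aug = Word2VecAug("my_thai_vectors.bin", tokenize=word_tokenize, type="binary")
    print(aug.augment("ผมเรียน", n_sent=2, p=0.7))
    # up to n_sent tuples; each word is swapped for a most_similar() neighbour
    # with similarity >= p, or kept as-is when it is not in the vocabulary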
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/augment/word2vec/ltw2v.html b/5.1/_modules/pythainlp/augment/word2vec/ltw2v.html new file mode 100644 index 0000000..05298ae --- /dev/null +++ b/5.1/_modules/pythainlp/augment/word2vec/ltw2v.html @@ -0,0 +1,212 @@ + + + + + + + + pythainlp.augment.word2vec.ltw2v — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for pythainlp.augment.word2vec.ltw2v

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from typing import List, Tuple
+
+from pythainlp.augment.word2vec.core import Word2VecAug
+from pythainlp.corpus import get_corpus_path
+from pythainlp.tokenize import word_tokenize
+
+
+
+[docs] +class LTW2VAug: + """ + Text Augment using word2vec from LTW2V + + LTW2V: + `github.com/PyThaiNLP/large-thaiword2vec <https://github.com/PyThaiNLP/large-thaiword2vec>`_ + """ + +
+[docs] + def __init__(self): + self.ltw2v_wv = get_corpus_path("ltw2v") + self.load_w2v()
+ + +
+[docs] + def tokenizer(self, text: str) -> List[str]: + """ + :param str text: Thai text + :rtype: List[str] + """ + return word_tokenize(text, engine="newmm")
+ + +
+[docs] + def load_w2v(self): # insert substitute + """ + Load LTW2V's word2vec model + """ + self.aug = Word2VecAug(self.ltw2v_wv, self.tokenizer, type="binary")
+ + +
+[docs] + def augment( + self, sentence: str, n_sent: int = 1, p: float = 0.7 + ) -> List[Tuple[str]]: + """ + Text Augment using word2vec from Thai2Fit + + :param str sentence: Thai sentence + :param int n_sent: number of sentence + :param float p: probability of word + + :return: list of text augmented + :rtype: List[Tuple[str]] + + :Example: + :: + + from pythainlp.augment.word2vec import LTW2VAug + + aug = LTW2VAug() + aug.augment("ผมเรียน", n_sent=2, p=0.5) + # output: [('เขา', 'เรียนหนังสือ'), ('เขา', 'สมัครเรียน')] + """ + return self.aug.augment(sentence, n_sent, p)
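A note on the example above: the constructor resolves the "ltw2v" word2vec corpus with get_corpus_path, so the vectors must be available locally; a sketch including the one-time download step:

    from pythainlp.corpus import download
    from pythainlp.augment.word2vec import LTW2VAug

    download("ltw2v")  # skips the download if the corpus is already installed
    aug = LTW2VAug()
    print(aug.augment("ผมเรียน", n_sent=2, p=0.5))
    # per the docstring: [('เขา', 'เรียนหนังสือ'), ('เขา', 'สมัครเรียน')]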
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/augment/word2vec/thai2fit.html b/5.1/_modules/pythainlp/augment/word2vec/thai2fit.html new file mode 100644 index 0000000..3dc4779 --- /dev/null +++ b/5.1/_modules/pythainlp/augment/word2vec/thai2fit.html @@ -0,0 +1,212 @@ + + + + + + + + pythainlp.augment.word2vec.thai2fit — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for pythainlp.augment.word2vec.thai2fit

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from typing import List, Tuple
+
+from pythainlp.augment.word2vec.core import Word2VecAug
+from pythainlp.corpus import get_corpus_path
+from pythainlp.tokenize import THAI2FIT_TOKENIZER
+
+
+
+[docs] +class Thai2fitAug: + """ + Text Augment using word2vec from Thai2Fit + + Thai2Fit: + `github.com/cstorm125/thai2fit <https://github.com/cstorm125/thai2fit>`_ + """ + +
+[docs] + def __init__(self): + self.thai2fit_wv = get_corpus_path("thai2fit_wv") + self.load_w2v()
+ + +
+[docs] + def tokenizer(self, text: str) -> List[str]: + """ + :param str text: Thai text + :rtype: List[str] + """ + return THAI2FIT_TOKENIZER.word_tokenize(text)
+ + +
+[docs] + def load_w2v(self): + """ + Load Thai2Fit's word2vec model + """ + self.aug = Word2VecAug(self.thai2fit_wv, self.tokenizer, type="binary")
+ + +
+[docs] + def augment( + self, sentence: str, n_sent: int = 1, p: float = 0.7 + ) -> List[Tuple[str]]: + """ + Text Augment using word2vec from Thai2Fit + + :param str sentence: Thai sentence + :param int n_sent: number of sentence + :param float p: probability of word + + :return: list of text augmented + :rtype: List[Tuple[str]] + + :Example: + :: + + from pythainlp.augment.word2vec import Thai2fitAug + + aug = Thai2fitAug() + aug.augment("ผมเรียน", n_sent=2, p=0.5) + # output: [('พวกเรา', 'เรียน'), ('ฉัน', 'เรียน')] + """ + return self.aug.augment(sentence, n_sent, p)
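Thai2fitAug mirrors LTW2VAug above; the differences visible in this file are the vectors ("thai2fit_wv") and the tokenizer (THAI2FIT_TOKENIZER instead of newmm). A sketch with the same corpus-download caveat:

    from pythainlp.augment.word2vec import Thai2fitAug

    aug = Thai2fitAug()  # loads the "thai2fit_wv" vectors via get_corpus_path
    print(aug.augment("ผมเรียน", n_sent=2, p=0.5))
    # per the docstring: [('พวกเรา', 'เรียน'), ('ฉัน', 'เรียน')]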
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/augment/wordnet.html b/5.1/_modules/pythainlp/augment/wordnet.html new file mode 100644 index 0000000..ad63509 --- /dev/null +++ b/5.1/_modules/pythainlp/augment/wordnet.html @@ -0,0 +1,367 @@ + + + + + + + + pythainlp.augment.wordnet — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.augment.wordnet

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thanks to https://dev.to/ton_ami/text-data-augmentation-synonym-replacement-4h8l
+"""
+__all__ = [
+    "WordNetAug",
+    "postype2wordnet",
+]
+
+import itertools
+from collections import OrderedDict
+from typing import List
+
+from nltk.corpus import wordnet as wn
+
+from pythainlp.corpus import wordnet
+from pythainlp.tag import pos_tag
+from pythainlp.tokenize import word_tokenize
+
+orchid = {
+    "": "",
+    # NOUN
+    "NOUN": wn.NOUN,
+    "NCMN": wn.NOUN,
+    "NTTL": wn.NOUN,
+    "CNIT": wn.NOUN,
+    "CLTV": wn.NOUN,
+    "CMTR": wn.NOUN,
+    "CFQC": wn.NOUN,
+    "CVBL": wn.NOUN,
+    # VERB
+    "VACT": wn.VERB,
+    "VSTA": wn.VERB,
+    # PROPN
+    "PROPN": "",
+    "NPRP": "",
+    # ADJ
+    "ADJ": wn.ADJ,
+    "NONM": wn.ADJ,
+    "VATT": wn.ADJ,
+    "DONM": wn.ADJ,
+    # ADV
+    "ADV": wn.ADV,
+    "ADVN": wn.ADV,
+    "ADVI": wn.ADV,
+    "ADVP": wn.ADV,
+    "ADVS": wn.ADV,
+    # INT
+    "INT": "",
+    # PRON
+    "PRON": "",
+    "PPRS": "",
+    "PDMN": "",
+    "PNTR": "",
+    # DET
+    "DET": "",
+    "DDAN": "",
+    "DDAC": "",
+    "DDBQ": "",
+    "DDAQ": "",
+    "DIAC": "",
+    "DIBQ": "",
+    "DIAQ": "",
+    # NUM
+    "NUM": "",
+    "NCNM": "",
+    "NLBL": "",
+    "DCNM": "",
+    # AUX
+    "AUX": "",
+    "XVBM": "",
+    "XVAM": "",
+    "XVMM": "",
+    "XVBB": "",
+    "XVAE": "",
+    # ADP
+    "ADP": "",
+    "RPRE": "",
+    # CCONJ
+    "CCONJ": "",
+    "JCRG": "",
+    # SCONJ
+    "SCONJ": "",
+    "PREL": "",
+    "JSBR": "",
+    "JCMP": "",
+    # PART
+    "PART": "",
+    "FIXN": "",
+    "FIXV": "",
+    "EAFF": "",
+    "EITT": "",
+    "AITT": "",
+    "NEG": "",
+    # PUNCT
+    "PUNCT": "",
+    "PUNC": "",
+}
+
+
+def postype2wordnet(pos: str, corpus: str):
+    """
+    Convert a part-of-speech tag to a WordNet POS type
+
+    :param str pos: POS type
+    :param str corpus: part-of-speech corpus
+
+    **Options for corpus**
+        * *orchid* - Orchid Corpus
+    """
+    if corpus not in ["orchid"]:
+        return None
+    return orchid[pos]
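A small sketch of the mapping defined above: only the "orchid" tag set is handled, tags mapped to the empty string have no WordNet counterpart, and any other corpus name returns None:

    from nltk.corpus import wordnet as wn

    from pythainlp.augment.wordnet import postype2wordnet

    print(postype2wordnet("NCMN", "orchid") == wn.NOUN)  # True: common noun
    print(postype2wordnet("JSBR", "orchid"))             # "" : no WordNet mapping
    print(postype2wordnet("NCMN", "ud"))                 # None: unsupported corpus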
+
+
+
+[docs] +class WordNetAug: + """ + Text Augment using wordnet + """ + +
+[docs] + def __init__(self): + pass
+ + +
+[docs] + def find_synonyms( + self, word: str, pos: str = None, postag_corpus: str = "orchid" + ) -> List[str]: + """ + Find synonyms using wordnet + + :param str word: word + :param str pos: part-of-speech type + :param str postag_corpus: name of POS tag corpus + :return: list of synonyms + :rtype: List[str] + """ + self.synonyms = [] + if pos is None: + self.list_synsets = wordnet.synsets(word) + else: + self.p2w_pos = postype2wordnet(pos, postag_corpus) + if self.p2w_pos != "": + self.list_synsets = wordnet.synsets(word, pos=self.p2w_pos) + else: + self.list_synsets = wordnet.synsets(word) + + for self.synset in wordnet.synsets(word): + for self.syn in self.synset.lemma_names(lang="tha"): + self.synonyms.append(self.syn) + + self.synonyms_without_duplicates = list( + OrderedDict.fromkeys(self.synonyms) + ) + return self.synonyms_without_duplicates
+ + +
+[docs] + def augment( + self, + sentence: str, + tokenize: object = word_tokenize, + max_syn_sent: int = 6, + postag: bool = True, + postag_corpus: str = "orchid", + ) -> List[List[str]]: + """ + Text Augment using wordnet + + :param str sentence: Thai sentence + :param object tokenize: function for tokenizing words + :param int max_syn_sent: maximum number of synonymous sentences + :param bool postag: use part-of-speech + :param str postag_corpus: name of POS tag corpus + + :return: list of synonyms + :rtype: List[Tuple[str]] + + :Example: + :: + + from pythainlp.augment import WordNetAug + + aug = WordNetAug() + aug.augment("เราชอบไปโรงเรียน") + # output: [('เรา', 'ชอบ', 'ไป', 'ร.ร.'), + ('เรา', 'ชอบ', 'ไป', 'รร.'), + ('เรา', 'ชอบ', 'ไป', 'โรงเรียน'), + ('เรา', 'ชอบ', 'ไป', 'อาคารเรียน'), + ('เรา', 'ชอบ', 'ไปยัง', 'ร.ร.'), + ('เรา', 'ชอบ', 'ไปยัง', 'รร.')] + """ + new_sentences = [] + self.list_words = tokenize(sentence) + self.list_synonym = [] + self.p_all = 1 + if postag: + self.list_pos = pos_tag(self.list_words, corpus=postag_corpus) + for word, pos in self.list_pos: + self.temp = self.find_synonyms(word, pos, postag_corpus) + if not self.temp: + self.list_synonym.append([word]) + else: + self.list_synonym.append(self.temp) + self.p_all *= len(self.temp) + else: + for word in self.list_words: + self.temp = self.find_synonyms(word) + if not self.temp: + self.list_synonym.append([word]) + else: + self.list_synonym.append(self.temp) + self.p_all *= len(self.temp) + if max_syn_sent > self.p_all: + max_syn_sent = self.p_all + for x in list(itertools.product(*self.list_synonym))[0:max_syn_sent]: + new_sentences.append(x) + return new_sentences
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/benchmarks/word_tokenization.html b/5.1/_modules/pythainlp/benchmarks/word_tokenization.html new file mode 100644 index 0000000..5d2439f --- /dev/null +++ b/5.1/_modules/pythainlp/benchmarks/word_tokenization.html @@ -0,0 +1,421 @@ + + + + + + + + pythainlp.benchmarks.word_tokenization — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for pythainlp.benchmarks.word_tokenization

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+import re
+import sys
+from typing import List, Tuple
+
+import numpy as np
+import pandas as pd
+
+SEPARATOR = "|"
+
+# regex for removing one space surrounded by separators, i.e. | |
+SURROUNDING_SEPS_RX = re.compile(
+    "{sep}? ?{sep}$".format(sep=re.escape(SEPARATOR))
+)
+
+# regex for removing repeated separators, i.e. ||||
+MULTIPLE_SEPS_RX = re.compile("{sep}+".format(sep=re.escape(SEPARATOR)))
+
+# regex for removing tags, i.e. <NE>, </NE>
+TAG_RX = re.compile(r"<\/?[A-Z]+>")
+
+# regex for removing trailing separators, i.e.  a|dog| -> a|dog
+TAILING_SEP_RX = re.compile("{sep}$".format(sep=re.escape(SEPARATOR)))
+
+
+def _f1(precision: float, recall: float) -> float:
+    """
+    Compute f1.
+
+    :param float precision: precision value
+    :param float recall: recall value
+
+    :return: f1
+    :rtype: float
+    """
+    if precision == recall == 0:
+        return 0
+    return 2 * precision * recall / (precision + recall)
+
+
+def _flatten_result(my_dict: dict, sep: str = ":") -> dict:
+    """
+    Flatten a two-dimensional dictionary.
+
+    Use keys in the first dimension as a prefix for keys in the second dimension.
+    For example,
+    my_dict = { "a": { "b": 7 } }
+    flatten(my_dict)
+    { "a:b": 7 }
+
+
+    :param dict my_dict: dictionary containing stats
+    :param str sep: separator between the two keys (default: ":")
+
+    :return: a one-dimension dictionary with keys combined
+    :rtype: dict[str, float | str]
+    """
+    items = []
+    for k1, kv2 in my_dict.items():
+        for k2, v in kv2.items():
+            new_key = f"{k1}{sep}{k2}"
+            items.append((new_key, v))
+
+    return dict(items)
+
+
+
+[docs] +def benchmark(ref_samples: List[str], samples: List[str]) -> pd.DataFrame: + """ + Performance benchmarking for samples. + + Please see :meth:`pythainlp.benchmarks.word_tokenization.compute_stats` for + the computed metrics. + + :param list[str] ref_samples: ground truth for samples + :param list[str] samples: samples that we want to evaluate + + :return: dataframe with row x col = len(samples) x len(metrics) + :rtype: pandas.DataFrame + """ + results = [] + for i, (r, s) in enumerate(zip(ref_samples, samples)): + try: + r, s = preprocessing(r), preprocessing(s) + if r and s: + stats = compute_stats(r, s) + stats = _flatten_result(stats) + stats["expected"] = r + stats["actual"] = s + results.append(stats) + except: + reason = """ +[Error] +Reason: %s + +Pair (i=%d) +--- label +%s +--- sample +%s +""" % ( + sys.exc_info(), + i, + r, + s, + ) + raise SystemExit(reason) + + return pd.DataFrame(results)
+ + + +
+[docs] +def preprocessing(txt: str, remove_space: bool = True) -> str: + """ + Clean up text before performing evaluation. + + :param str text: text to be preprocessed + :param bool remove_space: whether to remove white space + + :return: preprocessed text + :rtype: str + """ + txt = re.sub(SURROUNDING_SEPS_RX, "", txt) + + if remove_space: + txt = re.sub(r"\s+", "", txt) + + txt = re.sub(MULTIPLE_SEPS_RX, SEPARATOR, txt) + + txt = re.sub(TAG_RX, "", txt) + + txt = re.sub(TAILING_SEP_RX, "", txt).strip() + + return txt
+ + + +
+[docs] +def compute_stats(ref_sample: str, raw_sample: str) -> dict: + """ + Compute statistics for tokenization quality + + These statistics include: + + **Character-Level**: + True Positive, False Positive, True Negative, False Negative, Precision, Recall, and f1 + **Word-Level**: + Precision, Recall, and f1 + **Other**: + - Correct tokenization indicator: {0, 1} sequence indicating that the corresponding + word is tokenized correctly. + + :param str ref_sample: ground truth for samples + :param str samples: samples that we want to evaluate + + :return: metrics at character- and word-level and indicators of correctly tokenized words + :rtype: dict[str, float | str] + """ + ref_sample = _binary_representation(ref_sample) + sample = _binary_representation(raw_sample) + + # Compute character-level statistics + c_pos_pred, c_neg_pred = np.argwhere(sample == 1), np.argwhere(sample == 0) + + c_pos_pred = c_pos_pred[c_pos_pred < ref_sample.shape[0]] + c_neg_pred = c_neg_pred[c_neg_pred < ref_sample.shape[0]] + + c_tp = np.sum(ref_sample[c_pos_pred] == 1) + c_fp = np.sum(ref_sample[c_pos_pred] == 0) + + c_tn = np.sum(ref_sample[c_neg_pred] == 0) + c_fn = np.sum(ref_sample[c_neg_pred] == 1) + + # Compute word-level statistics + + # Find correctly tokenized words in the reference sample + word_boundaries = _find_word_boundaries(ref_sample) + + # Find correctly tokenized words in the sample + ss_boundaries = _find_word_boundaries(sample) + tokenization_indicators = _find_words_correctly_tokenised( + word_boundaries, ss_boundaries + ) + + correctly_tokenised_words = np.sum(tokenization_indicators) + + tokenization_indicators = list( + map(str, tokenization_indicators) + ) + + return { + "char_level": { + "tp": c_tp, + "fp": c_fp, + "tn": c_tn, + "fn": c_fn, + }, + "word_level": { + "correctly_tokenised_words": correctly_tokenised_words, + "total_words_in_sample": np.sum(sample), + "total_words_in_ref_sample": np.sum(ref_sample), + }, + "global": { + "tokenisation_indicators": "".join(tokenization_indicators) + }, + }
+ + + +def _binary_representation(txt: str, verbose: bool = False): + """ + Transform text into {0, 1} sequence. + + where (1) indicates that the corresponding character is the beginning of + a word. For example, ผม|ไม่|ชอบ|กิน|ผัก -> 10100... + + :param str txt: input text that we want to transform + :param bool verbose: for debugging purposes + + :return: {0, 1} sequence + :rtype: str + """ + chars = np.array(list(txt)) + + boundary = np.argwhere(chars == SEPARATOR).reshape(-1) + boundary = boundary - np.array(range(boundary.shape[0])) + + bin_rept = np.zeros(len(txt) - boundary.shape[0]) + bin_rept[list(boundary) + [0]] = 1 + + sample_wo_seps = list(txt.replace(SEPARATOR, "")) + + # sanity check + assert len(sample_wo_seps) == len(bin_rept) + + if verbose: + for c, m in zip(sample_wo_seps, bin_rept): + print("%s -- %d" % (c, m)) + + return bin_rept + + +def _find_word_boundaries(bin_reps) -> list: + """ + Find the starting and ending location of each word. + + :param str bin_reps: binary representation of a text + + :return: list of tuples (start, end) + :rtype: list[tuple(int, int)] + """ + boundary = np.argwhere(bin_reps == 1).reshape(-1) + start_idx = boundary + end_idx = boundary[1:].tolist() + [bin_reps.shape[0]] + + return list(zip(start_idx, end_idx)) + + +def _find_words_correctly_tokenised( + ref_boundaries: List[Tuple[int, int]], + predicted_boundaries: List[Tuple[int, int]], +) -> Tuple[int]: + """ + Find whether each word is correctly tokenized. + + :param list[tuple(int, int)] ref_boundaries: word boundaries of reference tokenization + :param list[tuple(int, int)] predicted_boundaries: word boundareies of predicted tokenization + + :return: binary sequence where 1 indicates the corresponding word is tokenized correctly + :rtype: tuple[int] + """ + ref_b = dict(zip(ref_boundaries, [1] * len(ref_boundaries))) + + labels = tuple(map(lambda x: ref_b.get(x, 0), predicted_boundaries)) + return labels +
+ +
+
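A minimal sketch of the two public entry points defined above, benchmark() and compute_stats(); samples mark word boundaries with "|" (the module-level SEPARATOR), and the flattened per-sample metrics come back as pandas columns such as "word_level:correctly_tokenised_words":

    from pythainlp.benchmarks.word_tokenization import benchmark, compute_stats

    ref    = ["ผม|ไม่|ชอบ|กิน|ผัก", "ฉัน|รัก|แมว"]
    actual = ["ผม|ไม่|ชอบ|กินผัก", "ฉัน|รัก|แมว"]

    df = benchmark(ref, actual)            # one row per (reference, prediction) pair
    print(df.filter(like="word_level:"))   # word-level counts per sample

    stats = compute_stats("ผม|ชอบ|แมว", "ผม|ชอบแมว")
    print(stats["word_level"]["correctly_tokenised_words"])  # boundary-exact words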
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/chat/core.html b/5.1/_modules/pythainlp/chat/core.html new file mode 100644 index 0000000..eee432d --- /dev/null +++ b/5.1/_modules/pythainlp/chat/core.html @@ -0,0 +1,247 @@ + + + + + + + + pythainlp.chat.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.chat.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+
+[docs] +class ChatBotModel: +
+[docs] + def __init__(self): + """ + Chat using AI generation + """ + self.history = []
+ + +
+[docs] + def reset_chat(self): + """ + Reset chat by cleaning history + """ + self.history = []
+ + +
+[docs] + def load_model( + self, + model_name: str = "wangchanglm", + return_dict: bool = True, + load_in_8bit: bool = False, + device: str = "cuda", + torch_dtype=torch.float16, + offload_folder: str = "./", + low_cpu_mem_usage: bool = True, + ): + """ + Load model + + :param str model_name: Model name (Now, we support wangchanglm only) + :param bool return_dict: return_dict + :param bool load_in_8bit: load model in 8bit + :param str device: device (cpu, cuda or other) + :param torch_dtype torch_dtype: torch_dtype + :param str offload_folder: offload folder + :param bool low_cpu_mem_usage: low cpu mem usage + """ + if model_name == "wangchanglm": + from pythainlp.generate.wangchanglm import WangChanGLM + + self.model = WangChanGLM() + self.model.load_model( + model_path="pythainlp/wangchanglm-7.5B-sft-en-sharded", + return_dict=return_dict, + load_in_8bit=load_in_8bit, + offload_folder=offload_folder, + device=device, + torch_dtype=torch_dtype, + low_cpu_mem_usage=low_cpu_mem_usage, + ) + else: + raise NotImplementedError(f"We doesn't support {model_name}.")
+ + +
+[docs] + def chat(self, text: str) -> str: + """ + Chatbot + + :param str text: text for asking chatbot with. + :return: answer from chatbot. + :rtype: str + :Example: + :: + + from pythainlp.chat import ChatBotModel + import torch + + chatbot = ChatBotModel() + chatbot.load_model(device="cpu",torch_dtype=torch.bfloat16) + + print(chatbot.chat("สวัสดี")) + # output: ยินดีที่ได้รู้จัก + + print(chatbot.history) + # output: [('สวัสดี', 'ยินดีที่ได้รู้จัก')] + """ + _temp = "" + if self.history: + for h, b in self.history: + _temp += ( + self.model.PROMPT_DICT["prompt_chatbot"].format_map( + {"human": h, "bot": b} + ) + + self.model.stop_token + ) + _temp += self.model.PROMPT_DICT["prompt_chatbot"].format_map( + {"human": text, "bot": ""} + ) + _bot = self.model.gen_instruct(_temp) + self.history.append((text, _bot)) + return _bot
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/classify/param_free.html b/5.1/_modules/pythainlp/classify/param_free.html new file mode 100644 index 0000000..2bdec5a --- /dev/null +++ b/5.1/_modules/pythainlp/classify/param_free.html @@ -0,0 +1,253 @@ + + + + + + + + pythainlp.classify.param_free — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for pythainlp.classify.param_free

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+import gzip
+import json
+from typing import List, Tuple
+
+import numpy as np
+
+
+
+[docs] +class GzipModel: + """ + This class is a re-implementation of + “Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors + (Jiang et al., Findings 2023) + + :param list training_data: list [(text_sample,label)] + :param str model_path: Path for loading model (if you saved the model) + """ + +
+[docs] + def __init__(self, training_data: List[Tuple[str, str]] = None, model_path: str = None): + if model_path is not None: + self.load(model_path) + else: + self.training_data = np.array(training_data) + self.Cx2_list = self.train()
+ + +
+[docs] + def train(self): + Cx2_list = [] + for i in range(len(self.training_data)): + Cx2_list.append( + len(gzip.compress(self.training_data[i][0].encode("utf-8"))) + ) + return Cx2_list
+ + +
+[docs] + def predict(self, x1: str, k: int = 1) -> str: + """ + :param str x1: the text that we want to predict label for. + :param str k: k + :return: label + :rtype: str + + :Example: + :: + + from pythainlp.classify import GzipModel + + training_data = [ + ("รายละเอียดตามนี้เลยค่าา ^^", "Neutral"), + ("กลัวพวกมึงหาย อดกินบาบิก้อน", "Neutral"), + ("บริการแย่มากก เป็นหมอได้ไง😤", "Negative"), + ("ขับรถแย่มาก", "Negative"), + ("ดีนะครับ", "Positive"), + ("ลองแล้วรสนี้อร่อย... ชอบๆ", "Positive"), + ("ฉันรู้สึกโกรธ เวลามือถือแบตหมด", "Negative"), + ("เธอภูมิใจที่ได้ทำสิ่งดี ๆ และดีใจกับเด็ก ๆ", "Positive"), + ("นี่เป็นบทความหนึ่ง", "Neutral") + ] + model = GzipModel(training_data) + print(model.predict("ฉันดีใจ", k=1)) + # output: Positive + """ + Cx1 = len(gzip.compress(x1.encode("utf-8"))) + disance_from_x1 = [] + for i in range(len(self.Cx2_list)): + x2 = self.training_data[i][0] + Cx2 = self.Cx2_list[i] + x1x2 = "".join([x1, x2]) + Cx1x2 = len(gzip.compress(x1x2.encode("utf-8"))) + # normalized compression distance + ncd = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2) + disance_from_x1.append(ncd) + + sorted_idx = np.argsort(np.array(disance_from_x1)) + top_k_class = self.training_data[sorted_idx[:k], 1] + _, counts = np.unique(top_k_class, return_counts=True) + predict_class = top_k_class[counts.argmax()] + + return predict_class
+ + +
+[docs]
+    def save(self, path: str):
+        """
+        :param str path: path to save the model to
+        """
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump({
+                "training_data": self.training_data.tolist(),
+                "Cx2_list": self.Cx2_list
+            }, f, ensure_ascii=False)
+ + +
+[docs]
+    def load(self, path: str):
+        """
+        :param str path: path of the saved model to load
+        """
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        self.Cx2_list = data["Cx2_list"]
+        self.training_data = np.array(data["training_data"])
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/coref/core.html b/5.1/_modules/pythainlp/coref/core.html new file mode 100644 index 0000000..52060ce --- /dev/null +++ b/5.1/_modules/pythainlp/coref/core.html @@ -0,0 +1,199 @@ + + + + + + + + pythainlp.coref.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.coref.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from typing import List
+
+_MODEL = None
+
+
+
+[docs] +def coreference_resolution( + texts: List[str], model_name: str = "han-coref-v1.0", device: str = "cpu" +): + """ + Coreference Resolution + + :param List[str] texts: list of texts to apply coreference resolution to + :param str model_name: coreference resolution model + :param str device: device for running coreference resolution model on\ + ("cpu", "cuda", and others) + :return: List of texts with coreference resolution + :rtype: List[dict] + + :Options for model_name: + * *han-coref-v1.0* - (default) Han-Coref: Thai coreference resolution\ + by PyThaiNLP v1.0 + + :Example: + :: + + from pythainlp.coref import coreference_resolution + + print( + coreference_resolution( + ["Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก"] + ) + ) + # output: + # [ + # {'text': 'Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก', + # 'clusters_string': [['Bill Gates', 'ผม']], + # 'clusters': [[(0, 10), (50, 52)]]} + # ] + """ + global _MODEL + if isinstance(texts, str): + texts = [texts] + + if _MODEL is None and model_name == "han-coref-v1.0": + from pythainlp.coref.han_coref import HanCoref + + _MODEL = HanCoref(device=device) + + if _MODEL: + return _MODEL.predict(texts) + + return [ + {"text": text, "clusters_string": [], "clusters": []} for text in texts + ]
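`coreference_resolution` caches the loaded model in the module-level `_MODEL` variable, so only the first call pays the loading cost and later calls reuse it. The same lazy-initialization pattern in isolation, with a stand-in loader::

    _MODEL = None

    def get_model(device: str = "cpu"):
        """Load the (stand-in) model on first use and reuse it afterwards."""
        global _MODEL
        if _MODEL is None:
            print(f"loading model on {device} ...")  # stands in for HanCoref(device=device)
            _MODEL = object()
        return _MODEL

    first = get_model()
    second = get_model()    # no second load
    print(first is second)  # True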
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/corpus/common.html b/5.1/_modules/pythainlp/corpus/common.html new file mode 100644 index 0000000..f5ff900 --- /dev/null +++ b/5.1/_modules/pythainlp/corpus/common.html @@ -0,0 +1,572 @@ + + + + + + + + pythainlp.corpus.common — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.corpus.common

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Common lists of words.
+"""
+
+import ast
+
+__all__ = [
+    "countries",
+    "find_synonyms",
+    "provinces",
+    "thai_family_names",
+    "thai_female_names",
+    "thai_male_names",
+    "thai_negations",
+    "thai_dict",
+    "thai_stopwords",
+    "thai_syllables",
+    "thai_synonym",
+    "thai_synonyms",
+    "thai_words",
+    "thai_wsd_dict",
+]
+
+from typing import FrozenSet, List, Union
+
+from pythainlp.corpus import get_corpus, get_corpus_as_is, get_corpus_path
+from pythainlp.tools import warn_deprecation
+
+_THAI_COUNTRIES: FrozenSet[str] = frozenset()
+_THAI_COUNTRIES_FILENAME = "countries_th.txt"
+
+_THAI_THAILAND_PROVINCES: FrozenSet[str] = frozenset()
+_THAI_THAILAND_PROVINCES_DETAILS: List[dict] = []
+_THAI_THAILAND_PROVINCES_FILENAME = "thailand_provinces_th.csv"
+
+_THAI_SYLLABLES: FrozenSet[str] = frozenset()
+_THAI_SYLLABLES_FILENAME = "syllables_th.txt"
+
+_THAI_WORDS: FrozenSet[str] = frozenset()
+_THAI_WORDS_FILENAME = "words_th.txt"
+
+_THAI_STOPWORDS: FrozenSet[str] = frozenset()
+_THAI_STOPWORDS_FILENAME = "stopwords_th.txt"
+
+_THAI_NEGATIONS: FrozenSet[str] = frozenset()
+_THAI_NEGATIONS_FILENAME = "negations_th.txt"
+
+_THAI_FAMLIY_NAMES: FrozenSet[str] = frozenset()
+_THAI_FAMLIY_NAMES_FILENAME = "family_names_th.txt"
+_THAI_FEMALE_NAMES: FrozenSet[str] = frozenset()
+_THAI_FEMALE_NAMES_FILENAME = "person_names_female_th.txt"
+_THAI_MALE_NAMES: FrozenSet[str] = frozenset()
+_THAI_MALE_NAMES_FILENAME = "person_names_male_th.txt"
+
+_THAI_ORST_WORDS: FrozenSet[str] = frozenset()
+
+_THAI_DICT: dict[str, list] = {}
+_THAI_WSD_DICT: dict[str, list] = {}
+_THAI_SYNONYMS: dict[str, list] = {}
+
+
+
+[docs] +def countries() -> FrozenSet[str]: + """ + Return a frozenset of country names in Thai such as "แคนาดา", "โรมาเนีย", + "แอลจีเรีย", and "ลาว". + \n(See: `dev/pythainlp/corpus/countries_th.txt\ + <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/countries_th.txt>`_) + + :return: :class:`frozenset` containing country names in Thai + :rtype: :class:`frozenset` + """ + global _THAI_COUNTRIES + if not _THAI_COUNTRIES: + _THAI_COUNTRIES = get_corpus(_THAI_COUNTRIES_FILENAME) + + return _THAI_COUNTRIES
+ + + +
+[docs] +def provinces(details: bool = False) -> Union[FrozenSet[str], List[dict]]: + """ + Return a frozenset of Thailand province names in Thai such as "กระบี่", + "กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี". + \n(See: `dev/pythainlp/corpus/thailand_provinces_th.txt\ + <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/thailand_provinces_th.txt>`_) + + :param bool details: return details of provinces or not + + :return: :class:`frozenset` containing province names of Thailand \ + (if details is False) or :class:`list` containing :class:`dict` of \ + province names and details such as \ + [{'name_th': 'นนทบุรี', 'abbr_th': 'นบ', 'name_en': 'Nonthaburi', \ + 'abbr_en': 'NBI'}]. + :rtype: :class:`frozenset` or :class:`list` + """ + global _THAI_THAILAND_PROVINCES, _THAI_THAILAND_PROVINCES_DETAILS + + if not _THAI_THAILAND_PROVINCES or not _THAI_THAILAND_PROVINCES_DETAILS: + provs = set() + prov_details = [] + + for line in get_corpus_as_is(_THAI_THAILAND_PROVINCES_FILENAME): + p = line.split(",") + + prov = {} + prov["name_th"] = p[0] + prov["abbr_th"] = p[1] + prov["name_en"] = p[2] + prov["abbr_en"] = p[3] + + provs.add(prov["name_th"]) + prov_details.append(prov) + + _THAI_THAILAND_PROVINCES = frozenset(provs) + _THAI_THAILAND_PROVINCES_DETAILS = prov_details + + if details: + return _THAI_THAILAND_PROVINCES_DETAILS + + return _THAI_THAILAND_PROVINCES
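A short usage sketch of the two return shapes described above; the lookup assumes the province is present, and the printed values are only indicative::

    from pythainlp.corpus import provinces

    names = provinces()                 # frozenset of Thai province names
    print("นนทบุรี" in names)

    details = provinces(details=True)   # list of dicts with Thai/English names and abbreviations
    match = [p for p in details if p["name_th"] == "นนทบุรี"]
    if match:
        print(match[0]["name_en"])      # e.g. Nonthaburi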
+ + + +
+[docs] +def thai_syllables() -> FrozenSet[str]: + """ + Return a frozenset of Thai syllables such as "กรอบ", "ก็", "๑", "โมบ", + "โมน", "โม่ง", "กา", "ก่า", and, "ก้า". + \n(See: `dev/pythainlp/corpus/syllables_th.txt\ + <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/syllables_th.txt>`_) + We use the Thai syllable list from `KUCut <https://github.com/Thanabhat/KUCut>`_. + + :return: :class:`frozenset` containing syllables in the Thai language. + :rtype: :class:`frozenset` + """ + global _THAI_SYLLABLES + if not _THAI_SYLLABLES: + _THAI_SYLLABLES = get_corpus(_THAI_SYLLABLES_FILENAME) + + return _THAI_SYLLABLES
+ + + +
+[docs] +def thai_words() -> FrozenSet[str]: + """ + Return a frozenset of Thai words such as "กติกา", "กดดัน", "พิษ", + and "พิษภัย". \n(See: `dev/pythainlp/corpus/words_th.txt\ + <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/words_th.txt>`_) + + :return: :class:`frozenset` containing words in the Thai language. + :rtype: :class:`frozenset` + """ + global _THAI_WORDS + if not _THAI_WORDS: + _THAI_WORDS = get_corpus(_THAI_WORDS_FILENAME) + + return _THAI_WORDS
+ + + +
+[docs] +def thai_orst_words() -> FrozenSet[str]: + """ + Return a frozenset of Thai words from Royal Society of Thailand + \n(See: `dev/pythainlp/corpus/thai_orst_words.txt\ + <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/orst_words_th.txt>`_) + + :return: :class:`frozenset` containing words in the Thai language. + :rtype: :class:`frozenset` + """ + global _THAI_ORST_WORDS + if not _THAI_ORST_WORDS: + _THAI_ORST_WORDS = get_corpus("orst_words_th.txt") + + return _THAI_ORST_WORDS
+ + + +
+[docs] +def thai_stopwords() -> FrozenSet[str]: + """ + Return a frozenset of Thai stopwords such as "มี", "ไป", "ไง", "ขณะ", + "การ", and "ประการหนึ่ง". \n(See: `dev/pythainlp/corpus/stopwords_th.txt\ + <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/stopwords_th.txt>`_) + We use stopword lists by thesis's เพ็ญศิริ ลี้ตระกูล. + + :See Also: + + เพ็ญศิริ ลี้ตระกูล . \ + การเลือกประโยคสำคัญในการสรุปความภาษาไทยโดยใช้แบบจำลองแบบลำดับชั้น. \ + กรุงเทพมหานคร : มหาวิทยาลัยธรรมศาสตร์; 2551. + + :return: :class:`frozenset` containing stopwords. + :rtype: :class:`frozenset` + """ + global _THAI_STOPWORDS + if not _THAI_STOPWORDS: + _THAI_STOPWORDS = get_corpus(_THAI_STOPWORDS_FILENAME) + + return _THAI_STOPWORDS
+ + + +
+[docs] +def thai_negations() -> FrozenSet[str]: + """ + Return a frozenset of Thai negation words including "ไม่" and "แต่". + \n(See: `dev/pythainlp/corpus/negations_th.txt\ + <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/negations_th.txt>`_) + + :return: :class:`frozenset` containing negations in the Thai language. + :rtype: :class:`frozenset` + """ + global _THAI_NEGATIONS + if not _THAI_NEGATIONS: + _THAI_NEGATIONS = get_corpus(_THAI_NEGATIONS_FILENAME) + + return _THAI_NEGATIONS
+ + + +
+[docs] +def thai_family_names() -> FrozenSet[str]: + """ + Return a frozenset of Thai family names + \n(See: `dev/pythainlp/corpus/family_names_th.txt\ + <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/family_names_th.txt>`_) + + :return: :class:`frozenset` containing Thai family names. + :rtype: :class:`frozenset` + """ + global _THAI_FAMLIY_NAMES + if not _THAI_FAMLIY_NAMES: + _THAI_FAMLIY_NAMES = get_corpus(_THAI_FAMLIY_NAMES_FILENAME) + + return _THAI_FAMLIY_NAMES
+ + + +
+[docs] +def thai_female_names() -> FrozenSet[str]: + """ + Return a frozenset of Thai female names + \n(See: `dev/pythainlp/corpus/person_names_female_th.txt\ + <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/person_names_female_th.txt>`_) + + :return: :class:`frozenset` containing Thai female names. + :rtype: :class:`frozenset` + """ + global _THAI_FEMALE_NAMES + if not _THAI_FEMALE_NAMES: + _THAI_FEMALE_NAMES = get_corpus(_THAI_FEMALE_NAMES_FILENAME) + + return _THAI_FEMALE_NAMES
+ + + +
+[docs] +def thai_male_names() -> FrozenSet[str]: + """ + Return a frozenset of Thai male names + \n(See: `dev/pythainlp/corpus/person_names_male_th.txt\ + <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/person_names_male_th.txt>`_) + + :return: :class:`frozenset` containing Thai male names. + :rtype: :class:`frozenset` + """ + global _THAI_MALE_NAMES + if not _THAI_MALE_NAMES: + _THAI_MALE_NAMES = get_corpus(_THAI_MALE_NAMES_FILENAME) + + return _THAI_MALE_NAMES
+ + + +
+[docs] +def thai_dict() -> dict: + """ + Return Thai dictionary with definition from wiktionary. + \n(See: `thai_dict\ + <https://pythainlp.org/pythainlp-corpus/thai_dict.html>`_) + + :return: Thai words with part-of-speech type and definition + :rtype: dict + """ + global _THAI_DICT + if _THAI_DICT: + return _THAI_DICT + + import csv + + path = get_corpus_path("thai_dict") + if not path: + return _THAI_DICT + path = str(path) + + _THAI_DICT = {"word": [], "meaning": []} + with open(path, newline="\n", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile, delimiter=",") + for row in reader: + _THAI_DICT["word"].append(row["word"]) + _THAI_DICT["meaning"].append(row["meaning"]) + + return _THAI_DICT
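Because `thai_dict()` returns two parallel lists rather than a mapping, looking up a single entry is an index search. A small sketch, assuming the headword exists in the corpus::

    from pythainlp.corpus import thai_dict

    data = thai_dict()    # {"word": [...], "meaning": [...]}, parallel lists
    word = "กิน"
    if word in data["word"]:
        i = data["word"].index(word)
        print(data["meaning"][i])   # raw meaning string as stored in the corpus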
+ + + +
+[docs] +def thai_wsd_dict() -> dict: + """ + Return Thai Word Sense Disambiguation dictionary with definition from wiktionary. + \n(See: `thai_dict\ + <https://pythainlp.org/pythainlp-corpus/thai_dict.html>`_) + + :return: Thai words with part-of-speech type and definition + :rtype: dict + """ + global _THAI_WSD_DICT + if _THAI_WSD_DICT: + return _THAI_WSD_DICT + + thai_wsd = thai_dict() + _THAI_WSD_DICT = {"word": [], "meaning": []} + for i, j in zip(thai_wsd["word"], thai_wsd["meaning"]): + all_value = list(ast.literal_eval(j).values()) + use = [] + for k in all_value: + use.extend(k) + use = list(set(use)) + if len(use) > 1: + _THAI_WSD_DICT["word"].append(i) + _THAI_WSD_DICT["meaning"].append(use) + + return _THAI_WSD_DICT
+ + + +
+[docs] +def thai_synonyms() -> dict: + """ + Return Thai synonyms. + \n(See: `thai_synonym\ + <https://pythainlp.org/pythainlp-corpus/thai_synonym.html>`_) + + :return: Thai words with part-of-speech type and synonym + :rtype: dict + """ + global _THAI_SYNONYMS + if _THAI_SYNONYMS: + return _THAI_SYNONYMS + + import csv + + path = get_corpus_path("thai_synonym") + if not path: + return _THAI_SYNONYMS + path = str(path) + + _THAI_SYNONYMS = {"word": [], "pos": [], "synonym": []} + with open(path, newline="\n", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile, delimiter=",") + for row in reader: + _THAI_SYNONYMS["word"].append(row["word"]) + _THAI_SYNONYMS["pos"].append(row["pos"]) + _THAI_SYNONYMS["synonym"].append(row["synonym"].split("|")) + + return _THAI_SYNONYMS
+ + + +def thai_synonym() -> dict: + warn_deprecation( + "pythainlp.corpus.thai_synonym", + "pythainlp.corpus.thai_synonyms", + "5.1", + "5.2", + ) + return thai_synonyms() + + +def find_synonyms(word: str) -> List[str]: + """ + Find synonyms + + :param str word: Thai word + :return: List of synonyms of the input word or an empty list if it isn't exist. + :rtype: List[str] + + :Example: + :: + + from pythainlp.corpus import find_synonyms + + print(find_synonyms("หมู")) + # output: ['จรุก', 'วราหะ', 'วราห์', 'ศูกร', 'สุกร'] + """ + synonyms = thai_synonyms() # get a dictionary of {word, synonym} + list_synonym = [] + + if word in synonyms["word"]: # find by word + list_synonym.extend(synonyms["synonym"][synonyms["word"].index(word)]) + + for idx, words in enumerate(synonyms["synonym"]): # find by synonym + if word in words: + list_synonym.extend(synonyms["synonym"][idx]) + list_synonym.append(synonyms["word"][idx]) + + list_synonym = sorted(list(set(list_synonym))) + + if word in list_synonym: # remove same word + list_synonym.remove(word) + + return list_synonym +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/corpus/conceptnet.html b/5.1/_modules/pythainlp/corpus/conceptnet.html new file mode 100644 index 0000000..6ad4994 --- /dev/null +++ b/5.1/_modules/pythainlp/corpus/conceptnet.html @@ -0,0 +1,256 @@ + + + + + + + + pythainlp.corpus.conceptnet — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.corpus.conceptnet

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Get data from ConceptNet API at http://conceptnet.io
+"""
+import requests
+
+
+
+[docs] +def edges(word: str, lang: str = "th"): + """ + Get edges from `ConceptNet <http://api.conceptnet.io/>`_ API. + ConceptNet is a public semantic network, designed to help computers + understand the meanings of words that people use. + + For example, the term "ConceptNet" is a "knowledge graph", and + "knowledge graph" has "common sense knowledge" which is a part of + "artificial intelligence". Also, "ConcepNet" is used for + "natural language understanding" which is a part of + "artificial intelligence". + + | "ConceptNet" --is a--> "knowledge graph" --has--> "common sense"\ + --a part of--> "artificial intelligence" + | "ConceptNet" --used for--> "natural language understanding"\ + --a part of--> "artificial intelligence" + + With this illustration, it shows relationships (represented as *Edge*) + between the terms (represented as *Node*). + + This function requires an internet connection to access the ConceptNet API. + Please use it considerately. It will timeout after 10 seconds. + + :param str word: word to be sent to ConceptNet API + :param str lang: abbreviation of language (i.e. *th* for Thai, *en* for + English, or *ja* for Japan). By default, it is *th* + (Thai). + + :return: return edges of the given word according to the + ConceptNet network. + :rtype: list[dict] + + :Example: + :: + + from pythainlp.corpus.conceptnet import edges + + edges('hello', lang='en') + # output: + # [{ + # '@id': '/a/[/r/IsA/,/c/en/hello/,/c/en/greeting/]', + # '@type': 'Edge', + # 'dataset': '/d/conceptnet/4/en', + # 'end': {'@id': '/c/en/greeting', + # '@type': 'Node', + # 'label': 'greeting', + # 'language': 'en', + # 'term': '/c/en/greeting'}, + # 'license': 'cc:by/4.0', + # 'rel': {'@id': '/r/IsA', '@type': 'Relation', 'label': 'IsA'}, + # 'sources': [ + # { + # '@id': '/and/[/s/activity/omcs/vote/,/s/contributor/omcs/bmsacr/]', + # '@type': 'Source', + # 'activity': '/s/activity/omcs/vote', + # 'contributor': '/s/contributor/omcs/bmsacr' + # }, + # { + # '@id': '/and/[/s/activity/omcs/vote/,/s/contributor/omcs/test/]', + # '@type': 'Source', + # 'activity': '/s/activity/omcs/vote', + # 'contributor': '/s/contributor/omcs/test'} + # ], + # 'start': {'@id': '/c/en/hello', + # '@type': 'Node', + # 'label': 'Hello', + # 'language': 'en', + # 'term': '/c/en/hello'}, + # 'surfaceText': '[[Hello]] is a kind of [[greeting]]', + # 'weight': 3.4641016151377544 + # }, ...] + + edges('สวัสดี', lang='th') + # output: + # [{ + # '@id': '/a/[/r/RelatedTo/,/c/th/สวัสดี/n/,/c/en/prosperity/]', + # '@type': 'Edge', + # 'dataset': '/d/wiktionary/en', + # 'end': {'@id': '/c/en/prosperity', + # '@type': 'Node', + # 'label': 'prosperity', + # 'language': 'en', + # 'term': '/c/en/prosperity'}, + # 'license': 'cc:by-sa/4.0', + # 'rel': { + # '@id': '/r/RelatedTo', '@type': 'Relation', + # 'label': 'RelatedTo'}, + # 'sources': [{ + # '@id': '/and/[/s/process/wikiparsec/2/,/s/resource/wiktionary/en/]', + # '@type': 'Source', + # 'contributor': '/s/resource/wiktionary/en', + # 'process': '/s/process/wikiparsec/2'}], + # 'start': {'@id': '/c/th/สวัสดี/n', + # '@type': 'Node', + # 'label': 'สวัสดี', + # 'language': 'th', + # 'sense_label': 'n', + # 'term': '/c/th/สวัสดี'}, + # 'surfaceText': None, + # 'weight': 1.0 + # }, ...] + """ + + obj = requests.get(f"https://api.conceptnet.io/c/{lang}/{word}", timeout=10).json() + return obj["edges"]
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/corpus/core.html b/5.1/_modules/pythainlp/corpus/core.html new file mode 100644 index 0000000..af6966b --- /dev/null +++ b/5.1/_modules/pythainlp/corpus/core.html @@ -0,0 +1,778 @@ + + + + + + + + pythainlp.corpus.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.corpus.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Corpus related functions.
+"""
+
+import json
+import os
+from typing import Union
+
+from pythainlp import __version__
+from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path
+from pythainlp.tools import get_full_data_path
+
+_CHECK_MODE = os.getenv("PYTHAINLP_READ_MODE")
+
+
+
+[docs] +def get_corpus_db(url: str): + """ + Get corpus catalog from server. + + :param str url: URL corpus catalog + """ + import requests + + corpus_db = None + try: + corpus_db = requests.get(url, timeout=10) + except requests.exceptions.HTTPError as http_err: + print(f"HTTP error occurred: {http_err}") + except requests.exceptions.RequestException as err: + print(f"Non-HTTP error occurred: {err}") + + return corpus_db
+ + + +
+[docs] +def get_corpus_db_detail(name: str, version: str = "") -> dict: + """ + Get details about a corpus, using information from local catalog. + + :param str name: name of corpus + :return: details about corpus + :rtype: dict + """ + with open(corpus_db_path(), "r", encoding="utf-8-sig") as f: + local_db = json.load(f) + + if not version: + for corpus in local_db["_default"].values(): + if corpus["name"] == name: + return corpus + else: + for corpus in local_db["_default"].values(): + if corpus["name"] == name and corpus["version"] == version: + return corpus + + return {}
+ + + +def path_pythainlp_corpus(filename: str) -> str: + """ + Get path pythainlp.corpus data + + :param str filename: filename of the corpus to be read + + :return: : path of corpus + :rtype: str + """ + return os.path.join(corpus_path(), filename) + + +
+[docs] +def get_corpus(filename: str, comments: bool = True) -> frozenset: + """ + Read corpus data from file and return a frozenset. + + Each line in the file will be a member of the set. + + Whitespace stripped and empty values and duplicates removed. + + If comments is False, any text at any position after the character + '#' in each line will be discarded. + + :param str filename: filename of the corpus to be read + :param bool comments: keep comments + + :return: :class:`frozenset` consisting of lines in the file + :rtype: :class:`frozenset` + + :Example: + :: + + from pythainlp.corpus import get_corpus + + # input file (negations_th.txt): + # แต่ + # ไม่ + + get_corpus("negations_th.txt") + # output: + # frozenset({'แต่', 'ไม่'}) + + # input file (ttc_freq.txt): + # ตัวบท<tab>10 + # โดยนัยนี้<tab>1 + + get_corpus("ttc_freq.txt") + # output: + # frozenset({'โดยนัยนี้\\t1', + # 'ตัวบท\\t10', + # ...}) + + # input file (icubrk_th.txt): + # # Thai Dictionary for ICU BreakIterator + # กก + # กกขนาก + + get_corpus("icubrk_th.txt") + # output: + # frozenset({'กกขนาก', + # '# Thai Dictionary for ICU BreakIterator', + # 'กก', + # ...}) + + get_corpus("icubrk_th.txt", comments=False) + # output: + # frozenset({'กกขนาก', + # 'กก', + # ...}) + + """ + path = path_pythainlp_corpus(filename) + lines = [] + with open(path, "r", encoding="utf-8-sig") as fh: + lines = fh.read().splitlines() + + if not comments: + # if the line has a '#' character, take only text before the first '#' + lines = [line.split("#", 1)[0].strip() for line in lines] + + return frozenset(filter(None, lines))
+ + + +
+[docs] +def get_corpus_as_is(filename: str) -> list: + """ + Read corpus data from file, as it is, and return a list. + + Each line in the file will be a member of the list. + + No modifications in member values and their orders. + + If strip or comment removal is needed, use get_corpus() instead. + + :param str filename: filename of the corpus to be read + + :return: :class:`list` consisting of lines in the file + :rtype: :class:`list` + + :Example: + :: + + from pythainlp.corpus import get_corpus + + # input file (negations_th.txt): + # แต่ + # ไม่ + + get_corpus_as_is("negations_th.txt") + # output: + # ['แต่', 'ไม่'] + """ + path = path_pythainlp_corpus(filename) + lines = [] + with open(path, "r", encoding="utf-8-sig") as fh: + lines = fh.read().splitlines() + + return lines
+ + + +
+[docs] +def get_corpus_default_db(name: str, version: str = "") -> Union[str, None]: + """ + Get model path from default_db.json + + :param str name: corpus name + :return: path to the corpus or **None** if the corpus doesn't \ + exist on the device + :rtype: str + + If you want to edit default_db.json, \ + you can edit pythainlp/corpus/default_db.json + """ + default_db_path = path_pythainlp_corpus("default_db.json") + with open(default_db_path, encoding="utf-8-sig") as fh: + corpus_db = json.load(fh) + + if name in list(corpus_db.keys()): + if version in list(corpus_db[name]["versions"].keys()): + return path_pythainlp_corpus( + corpus_db[name]["versions"][version]["filename"] + ) + elif not version: # load latest version + version = corpus_db[name]["latest_version"] + return path_pythainlp_corpus( + corpus_db[name]["versions"][version]["filename"] + ) + + return None
+ + + +
+[docs] +def get_corpus_path( + name: str, version: str = "", force: bool = False +) -> Union[str, None]: + """ + Get corpus path. + + :param str name: corpus name + :param str version: version + :param bool force: force downloading + :return: path to the corpus or **None** if the corpus doesn't \ + exist on the device + :rtype: str + + :Example: + + (Please see the filename in + `this file + <https://pythainlp.org/pythainlp-corpus/db.json>`_ + + If the corpus already exists:: + + from pythainlp.corpus import get_corpus_path + + print(get_corpus_path('ttc')) + # output: /root/pythainlp-data/ttc_freq.txt + + If the corpus has not been downloaded yet:: + + from pythainlp.corpus import download, get_corpus_path + + print(get_corpus_path('wiki_lm_lstm')) + # output: None + + download('wiki_lm_lstm') + # output: + # Download: wiki_lm_lstm + # wiki_lm_lstm 0.32 + # thwiki_lm.pth?dl=1: 1.05GB [00:25, 41.5MB/s] + # /root/pythainlp-data/thwiki_model_lstm.pth + + print(get_corpus_path('wiki_lm_lstm')) + # output: /root/pythainlp-data/thwiki_model_lstm.pth + """ + from typing import Dict + + CUSTOMIZE: Dict[str, str] = { + # "the corpus name":"path" + } + if name in list(CUSTOMIZE): + return CUSTOMIZE[name] + + default_path = get_corpus_default_db(name=name, version=version) + if default_path is not None: + return default_path + + # check if the corpus is in local catalog, download it if not + corpus_db_detail = get_corpus_db_detail(name, version=version) + + if not corpus_db_detail or not corpus_db_detail.get("filename"): + download(name, version=version, force=force) + corpus_db_detail = get_corpus_db_detail(name, version=version) + + if corpus_db_detail and corpus_db_detail.get("filename"): + # corpus is in the local catalog, get full path to the file + if corpus_db_detail.get("is_folder"): + path = get_full_data_path(corpus_db_detail.get("foldername")) + else: + path = get_full_data_path(corpus_db_detail.get("filename")) + # check if the corpus file actually exists, download it if not + if not os.path.exists(path): + download(name, version=version, force=force) + if os.path.exists(path): + return path + + return None
+ + + +def _download(url: str, dst: str) -> int: + """ + Download helper. + + @param: URL for downloading file + @param: dst place to put the file into + """ + CHUNK_SIZE = 64 * 1024 # 64 KiB + + from urllib.request import urlopen + + import requests + + file_size = int(urlopen(url).info().get("Content-Length", -1)) + r = requests.get(url, stream=True, timeout=10) + with open(get_full_data_path(dst), "wb") as f: + pbar = None + try: + from tqdm.auto import tqdm + + pbar = tqdm(total=int(r.headers["Content-Length"])) + except ImportError: + pbar = None + + for chunk in r.iter_content(chunk_size=CHUNK_SIZE): + if chunk: + f.write(chunk) + if pbar: + pbar.update(len(chunk)) + if pbar: + pbar.close() + else: + print("Done.") + return file_size + + +def _check_hash(dst: str, md5: str) -> None: + """ + Check hash helper. + + @param: dst place to put the file into + @param: md5 place to file hash (MD5) + """ + if md5 and md5 != "-": + import hashlib + + with open(get_full_data_path(dst), "rb") as f: + content = f.read() + file_md5 = hashlib.md5(content).hexdigest() + + if md5 != file_md5: + raise ValueError("Hash does not match expected.") + + +def _version2int(v: str) -> int: + """ + X.X.X => X0X0X + """ + if "-" in v: + v = v.split("-")[0] + if v.endswith(".*"): + v = v.replace(".*", ".0") # X.X.* => X.X.0 + v_list = v.split(".") + if len(v_list) < 3: + v_list.append("0") + v_new = "" + for i, value in enumerate(v_list): + if i != 0: + if len(value) < 2: + v_new += "0" + value + else: + v_new += value + else: + v_new += value + return int(v_new) + + +def _check_version(cause: str) -> bool: + temp = cause + check = False + __version = __version__ + if "dev" in __version: + __version = __version.split("dev", maxsplit=1)[0] + elif "beta" in __version: + __version = __version.split("beta", maxsplit=1)[0] + v = _version2int(__version) + + if cause == "*": + check = True + elif cause.startswith("==") and ">" not in cause and "<" not in cause: + temp = cause.replace("==", "") + check = v == _version2int(temp) + elif cause.startswith(">=") and "<" not in cause: + temp = cause.replace(">=", "") + check = v >= _version2int(temp) + elif cause.startswith(">") and "<" not in cause: + temp = cause.replace(">", "") + check = v > _version2int(temp) + elif cause.startswith(">=") and "<=" not in cause and "<" in cause: + temp = cause.replace(">=", "").split("<") + check = _version2int(temp[0]) <= v < _version2int(temp[1]) + elif cause.startswith(">=") and "<=" in cause: + temp = cause.replace(">=", "").split("<=") + check = _version2int(temp[0]) <= v <= _version2int(temp[1]) + elif cause.startswith(">") and "<" in cause: + temp = cause.replace(">", "").split("<") + check = _version2int(temp[0]) < v < _version2int(temp[1]) + elif cause.startswith("<="): + temp = cause.replace("<=", "") + check = v <= _version2int(temp[0]) + elif cause.startswith("<"): + temp = cause.replace("<", "") + check = v < _version2int(temp[0]) + + return check + + +
+[docs] +def download( + name: str, force: bool = False, url: str = "", version: str = "" +) -> bool: + """ + Download corpus. + + The available corpus names can be seen in this file: + https://pythainlp.org/pythainlp-corpus/db.json + + :param str name: corpus name + :param bool force: force downloading + :param str url: URL of the corpus catalog + :param str version: version of the corpus + :return: **True** if the corpus is found and successfully downloaded. + Otherwise, it returns **False**. + :rtype: bool + + :Example: + :: + + from pythainlp.corpus import download + + download("wiki_lm_lstm", force=True) + # output: + # Corpus: wiki_lm_lstm + # - Downloading: wiki_lm_lstm 0.1 + # thwiki_lm.pth: 26%|██▌ | 114k/434k [00:00<00:00, 690kB/s] + + By default, downloaded corpora and models will be saved in + ``$HOME/pythainlp-data/`` + (e.g. ``/Users/bact/pythainlp-data/wiki_lm_lstm.pth``). + """ + if _CHECK_MODE == "1": + print("PyThaiNLP is read-only mode. It can't download.") + return False + if not url: + url = corpus_db_url() + + corpus_db = get_corpus_db(url) + if not corpus_db: + print(f"Cannot download corpus catalog from: {url}") + return False + + corpus_db = corpus_db.json() + + # check if corpus is available + if name in corpus_db: + with open(corpus_db_path(), "r", encoding="utf-8-sig") as f: + local_db = json.load(f) + + corpus = corpus_db[name] + print("Corpus:", name) + if not version: + for v, file in corpus["versions"].items(): + if _check_version(file["pythainlp_version"]): + version = v + + # version may still be None here + if version not in corpus["versions"]: + print("Corpus not found.") + return False + elif ( + _check_version(corpus["versions"][version]["pythainlp_version"]) + is False + ): + print("Corpus version not supported.") + return False + corpus_versions = corpus["versions"][version] + file_name = corpus_versions["filename"] + found = "" + for i, item in local_db["_default"].items(): + # Do not check version here + if item["name"] == name: + # Record corpus no. 
if found in local database + found = i + break + + # If not found in local, download it + if force or not found: + print(f"- Downloading: {name} {version}") + _download( + corpus_versions["download_url"], + file_name, + ) + _check_hash( + file_name, + corpus_versions["md5"], + ) + + is_folder = False + foldername = None + + if corpus_versions["is_tar_gz"] == "True": + import tarfile + + is_folder = True + foldername = name + "_" + str(version) + if not os.path.exists(get_full_data_path(foldername)): + os.mkdir(get_full_data_path(foldername)) + with tarfile.open(get_full_data_path(file_name)) as tar: + tar.extractall(path=get_full_data_path(foldername)) + elif corpus_versions["is_zip"] == "True": + import zipfile + + is_folder = True + foldername = name + "_" + str(version) + if not os.path.exists(get_full_data_path(foldername)): + os.mkdir(get_full_data_path(foldername)) + with zipfile.ZipFile( + get_full_data_path(file_name), "r" + ) as zip_file: + zip_file.extractall(path=get_full_data_path(foldername)) + + if found: + local_db["_default"][found]["version"] = version + local_db["_default"][found]["filename"] = file_name + local_db["_default"][found]["is_folder"] = is_folder + local_db["_default"][found]["foldername"] = foldername + else: + # This awkward behavior is for backward-compatibility with + # database files generated previously using TinyDB + if local_db["_default"]: + corpus_no = ( + max((int(no) for no in local_db["_default"])) + 1 + ) + else: + corpus_no = 1 + local_db["_default"][str(corpus_no)] = { + "name": name, + "version": version, + "filename": file_name, + "is_folder": is_folder, + "foldername": foldername, + } + + with open(corpus_db_path(), "w", encoding="utf-8") as f: + json.dump(local_db, f, ensure_ascii=False) + # Check if versions match or if the corpus is found in local database + # but a re-download is not forced + else: + current_ver = local_db["_default"][found]["version"] + + if current_ver == version: + # Corpus of the same version already exists + print("- Already up to date.") + else: + # Corpus exists but is of different version + print(f"- Existing version: {current_ver}") + print(f"- New version available: {version}") + print("- Use download(data_name, force=True) to update") + + return True + + print("Corpus not found:", name) + return False
+ + + +
+[docs] +def remove(name: str) -> bool: + """ + Remove corpus + + :param str name: corpus name + :return: **True** if the corpus is found and successfully removed. + Otherwise, it returns **False**. + :rtype: bool + + :Example: + :: + + from pythainlp.corpus import remove, get_corpus_path, get_corpus + + print(remove("ttc")) + # output: True + + print(get_corpus_path("ttc")) + # output: None + + get_corpus("ttc") + # output: + # FileNotFoundError: [Errno 2] No such file or directory: + # '/usr/local/lib/python3.6/dist-packages/pythainlp/corpus/ttc' + """ + if _CHECK_MODE == "1": + print("PyThaiNLP is read-only mode. It can't remove corpus.") + return False + with open(corpus_db_path(), "r", encoding="utf-8-sig") as f: + db = json.load(f) + data = [ + corpus for corpus in db["_default"].values() if corpus["name"] == name + ] + + if data: + path = get_corpus_path(name) + if data[0].get("is_folder"): + import shutil + + os.remove(get_full_data_path(data[0].get("filename"))) + shutil.rmtree(path, ignore_errors=True) + else: + os.remove(path) + for i, corpus in db["_default"].copy().items(): + if corpus["name"] == name: + del db["_default"][i] + with open(corpus_db_path(), "w", encoding="utf-8") as f: + json.dump(db, f, ensure_ascii=False) + return True + + return False
+ + + +def get_path_folder_corpus(name, version, *path): + return os.path.join(get_corpus_path(name, version), *path) +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/corpus/oscar.html b/5.1/_modules/pythainlp/corpus/oscar.html new file mode 100644 index 0000000..e32adfe --- /dev/null +++ b/5.1/_modules/pythainlp/corpus/oscar.html @@ -0,0 +1,210 @@ + + + + + + + + pythainlp.corpus.oscar — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.corpus.oscar

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai unigram word frequency from OSCAR Corpus (words tokenized using ICU)
+
+Credit: Korakot Chaovavanich
+https://web.facebook.com/groups/colab.thailand/permalink/1524070061101680/
+"""
+
+__all__ = ["word_freqs", "unigram_word_freqs"]
+
+from collections import defaultdict
+from typing import List, Tuple
+
+from pythainlp.corpus import get_corpus_path
+
+_OSCAR_FILENAME = "oscar_icu"
+
+
+
+[docs] +def word_freqs() -> List[Tuple[str, int]]: + """ + Get word frequency from OSCAR Corpus (words tokenized using ICU) + """ + freqs: list[tuple[str, int]] = [] + path = get_corpus_path(_OSCAR_FILENAME) + if not path: + return freqs + path = str(path) + + with open(path, "r", encoding="utf-8-sig") as f: + lines = list(f.readlines()) + del lines[0] + for line in lines: + temp = line.strip().split(",") + if len(temp) >= 2: + if temp[0] != " " and '"' not in temp[0]: + freqs.append((temp[0], int(temp[1]))) + elif temp[0] == " ": + freqs.append(("<s/>", int(temp[1]))) + + return freqs
+ + + +
+[docs] +def unigram_word_freqs() -> dict[str, int]: + """ + Get unigram word frequency from OSCAR Corpus (words tokenized using ICU) + """ + freqs: dict[str, int] = defaultdict(int) + path = get_corpus_path(_OSCAR_FILENAME) + if not path: + return freqs + path = str(path) + + with open(path, "r", encoding="utf-8-sig") as fh: + lines = list(fh.readlines()) + del lines[0] + for i in lines: + temp = i.strip().split(",") + if temp[0] != " " and '"' not in temp[0]: + freqs[temp[0]] = int(temp[-1]) + elif temp[0] == " ": + freqs["<s/>"] = int(temp[-1]) + + return freqs
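Both functions return plain Python containers, so ordinary tooling applies. For example, the ten most frequent unigrams (assuming the `oscar_icu` corpus has already been downloaded; otherwise the dict is empty)::

    from pythainlp.corpus.oscar import unigram_word_freqs

    freqs = unigram_word_freqs()   # dict[str, int]
    top10 = sorted(freqs.items(), key=lambda kv: kv[1], reverse=True)[:10]
    for word, count in top10:
        print(word, count)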
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/corpus/th_en_translit.html b/5.1/_modules/pythainlp/corpus/th_en_translit.html new file mode 100644 index 0000000..5971a1f --- /dev/null +++ b/5.1/_modules/pythainlp/corpus/th_en_translit.html @@ -0,0 +1,215 @@ + + + + + + + + pythainlp.corpus.th_en_translit — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
+
+
+
+
+ +

Source code for pythainlp.corpus.th_en_translit

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai-English Transliteration Dictionary v1.4
+
+Wannaphong Phatthiyaphaibun. (2022).
+wannaphong/thai-english-transliteration-dictionary: v1.4 (v1.4).
+Zenodo. https://doi.org/10.5281/zenodo.6716672
+"""
+
+__all__ = [
+    "get_transliteration_dict",
+    "TRANSLITERATE_EN",
+    "TRANSLITERATE_FOLLOW_RTSG",
+]
+
+from collections import defaultdict
+
+from pythainlp.corpus import path_pythainlp_corpus
+
+_FILE_NAME = "th_en_transliteration_v1.4.tsv"
+TRANSLITERATE_EN = "en"
+TRANSLITERATE_FOLLOW_RTSG = "follow_rtsg"
+
+
+
+[docs] +def get_transliteration_dict() -> defaultdict: + """ + Get Thai to English transliteration dictionary. + + The returned dict is in dict[str, dict[List[str], List[Optional[bool]]]] format. + """ + path = path_pythainlp_corpus(_FILE_NAME) + if not path: + raise FileNotFoundError( + f"Unable to load transliteration dictionary. " + f"{_FILE_NAME} is not found under pythainlp/corpus." + ) + + # use list, as one word can have multiple transliterations. + trans_dict: defaultdict[str, dict[str, list]] = defaultdict( + lambda: {TRANSLITERATE_EN: [], TRANSLITERATE_FOLLOW_RTSG: []} + ) + try: + with open(path, "r", encoding="utf-8") as f: + # assume that the first row contains column names, so skip it. + for line in f.readlines()[1:]: + stripped = line.strip() + if stripped: + th, *en_checked = stripped.split("\t") + # replace in-between whitespace to prevent mismatched results from different tokenizers. + # e.g. "บอยแบนด์" + # route 1: "บอยแบนด์" -> ["บอย", "แบนด์"] -> ["boy", "band"] -> "boyband" + # route 2: "บอยแบนด์" -> [""บอยแบนด์""] -> ["boy band"] -> "boy band" + en_translit = en_checked[0].replace(" ", "") + trans_dict[th][TRANSLITERATE_EN].append(en_translit) + en_follow_rtgs = ( + bool(en_checked[1]) if len(en_checked) == 2 else None + ) + trans_dict[th][TRANSLITERATE_FOLLOW_RTSG].append( + en_follow_rtgs + ) + + except ValueError as exc: + raise ValueError( + f"Unable to parse {_FILE_NAME}. " + f"Make sure it is a 3-column tab-separated file with header." + ) from exc + else: + return trans_dict
+ + + +TRANSLITERATE_DICT = get_transliteration_dict() +
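A lookup sketch for the resulting dictionary; the example word comes from the comment above, and the printed values are only indicative::

    from pythainlp.corpus.th_en_translit import (
        TRANSLITERATE_DICT,
        TRANSLITERATE_EN,
        TRANSLITERATE_FOLLOW_RTSG,
    )

    entry = TRANSLITERATE_DICT["บอยแบนด์"]
    print(entry[TRANSLITERATE_EN])           # list of romanizations, spaces stripped on load
    print(entry[TRANSLITERATE_FOLLOW_RTSG])  # parallel list of True/False/None RTGS flags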
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/corpus/tnc.html b/5.1/_modules/pythainlp/corpus/tnc.html new file mode 100644 index 0000000..f2a409d --- /dev/null +++ b/5.1/_modules/pythainlp/corpus/tnc.html @@ -0,0 +1,240 @@ + + + + + + + + pythainlp.corpus.tnc — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.corpus.tnc

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai National Corpus word frequency
+"""
+
+__all__ = [
+    "bigram_word_freqs",
+    "trigram_word_freqs",
+    "unigram_word_freqs",
+    "word_freqs",
+]
+
+from collections import defaultdict
+from typing import List, Tuple
+
+from pythainlp.corpus import get_corpus, get_corpus_path
+
+_UNIGRAM_FILENAME = "tnc_freq.txt"
+_BIGRAM_CORPUS_NAME = "tnc_bigram_word_freqs"
+_TRIGRAM_CORPUS_NAME = "tnc_trigram_word_freqs"
+
+
+
+[docs] +def word_freqs() -> List[Tuple[str, int]]: + """ + Get word frequency from Thai National Corpus (TNC) + \n(See: `dev/pythainlp/corpus/tnc_freq.txt\ + <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/tnc_freq.txt>`_) + + Credit: Korakot Chaovavanich https://www.facebook.com/groups/thainlp/posts/434330506948445 + """ + freqs: list[tuple[str, int]] = [] + lines = list(get_corpus(_UNIGRAM_FILENAME)) + for line in lines: + word_freq = line.split("\t") + if len(word_freq) >= 2: + freqs.append((word_freq[0], int(word_freq[1]))) + + return freqs
+ + + +
+[docs] +def unigram_word_freqs() -> dict[str, int]: + """ + Get unigram word frequency from Thai National Corpus (TNC) + """ + freqs: dict[str, int] = defaultdict(int) + lines = list(get_corpus(_UNIGRAM_FILENAME)) + for i in lines: + _temp = i.strip().split(" ") + if len(_temp) >= 2: + freqs[_temp[0]] = int(_temp[-1]) + + return freqs
+ + + +
+[docs] +def bigram_word_freqs() -> dict[Tuple[str, str], int]: + """ + Get bigram word frequency from Thai National Corpus (TNC) + """ + freqs: dict[tuple[str, str], int] = defaultdict(int) + path = get_corpus_path(_BIGRAM_CORPUS_NAME) + if not path: + return freqs + path = str(path) + + with open(path, "r", encoding="utf-8-sig") as fh: + for i in fh.readlines(): + temp = i.strip().split(" ") + freqs[(temp[0], temp[1])] = int(temp[-1]) + + return freqs
+ + + +
+[docs] +def trigram_word_freqs() -> dict[Tuple[str, str, str], int]: + """ + Get trigram word frequency from Thai National Corpus (TNC) + """ + freqs: dict[tuple[str, str, str], int] = defaultdict(int) + path = get_corpus_path(_TRIGRAM_CORPUS_NAME) + if not path: + return freqs + path = str(path) + + with open(path, "r", encoding="utf-8-sig") as fh: + for i in fh.readlines(): + temp = i.strip().split(" ") + freqs[(temp[0], temp[1], temp[2])] = int(temp[-1]) + + return freqs
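As a small illustration of what these tables support, the sketch below estimates P(word | previous word) from the bigram and unigram counts (it assumes the TNC corpora are available; with empty tables it simply returns 0.0)::

    from pythainlp.corpus.tnc import bigram_word_freqs, unigram_word_freqs

    uni = unigram_word_freqs()
    bi = bigram_word_freqs()

    def next_word_prob(prev: str, word: str) -> float:
        """Maximum-likelihood estimate of P(word | prev) from TNC counts."""
        total = uni.get(prev, 0)
        return bi.get((prev, word), 0) / total if total else 0.0

    print(next_word_prob("ประเทศ", "ไทย"))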
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/corpus/ttc.html b/5.1/_modules/pythainlp/corpus/ttc.html new file mode 100644 index 0000000..90982b0 --- /dev/null +++ b/5.1/_modules/pythainlp/corpus/ttc.html @@ -0,0 +1,194 @@ + + + + + + + + pythainlp.corpus.ttc — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.corpus.ttc

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai Textbook Corpus (TTC) word frequency
+
+Credit: Korakot Chaovavanich
+https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
+"""
+
+__all__ = ["word_freqs", "unigram_word_freqs"]
+
+from collections import defaultdict
+from typing import List, Tuple
+
+from pythainlp.corpus import get_corpus
+
+_UNIGRAM_FILENAME = "ttc_freq.txt"
+
+
+
+[docs] +def word_freqs() -> List[Tuple[str, int]]: + """ + Get word frequency from Thai Textbook Corpus (TTC) + \n(See: `dev/pythainlp/corpus/ttc_freq.txt\ + <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/ttc_freq.txt>`_) + """ + freqs: list[tuple[str, int]] = [] + lines = list(get_corpus(_UNIGRAM_FILENAME)) + for line in lines: + word_freq = line.split("\t") + if len(word_freq) >= 2: + freqs.append((word_freq[0], int(word_freq[1]))) + + return freqs
+ + + +
+[docs] +def unigram_word_freqs() -> dict[str, int]: + """ + Get unigram word frequency from Thai Textbook Corpus (TTC) + """ + freqs: dict[str, int] = defaultdict(int) + + lines = list(get_corpus(_UNIGRAM_FILENAME)) + for i in lines: + temp = i.strip().split(" ") + if len(temp) >= 2: + freqs[temp[0]] = int(temp[-1]) + + return freqs
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/corpus/util.html b/5.1/_modules/pythainlp/corpus/util.html new file mode 100644 index 0000000..e7b5684 --- /dev/null +++ b/5.1/_modules/pythainlp/corpus/util.html @@ -0,0 +1,291 @@ + + + + + + + + pythainlp.corpus.util — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.corpus.util

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Tool for creating word lists.
+Code is from Korakot Chaovavanich.
+
+:See also:
+    * `Facebook post \
+        <https://www.facebook.com/groups/colab.thailand/permalink/1667821073393244>`_
+    * `Google Colab \
+        <https://colab.research.google.com/drive/19kY2jCHONuxmTJM0U8PIE_I5OK1rO-x_>`_
+"""
+
+from collections import Counter
+from typing import Callable, Iterable, Iterator, List, Set, Tuple
+
+from pythainlp.corpus import thai_words
+from pythainlp.tokenize import newmm
+from pythainlp.util import Trie
+
+
+def index_pairs(words: List[str]) -> Iterator[Tuple[int, int]]:
+    """
+    Return beginning and ending indexes of word pairs
+    """
+    i = 0
+    for w in words:
+        yield i, i + len(w)
+        i += len(w)
+
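`index_pairs` turns a gold-standard tokenization into (begin, end) character spans, which is what `find_badwords` below compares the tokenizer's output against. For example::

    print(list(index_pairs(["ผม", "รัก", "คุณ"])))
    # [(0, 2), (2, 5), (5, 8)]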
+
+
+[docs] +def find_badwords( + tokenize: Callable[[str], List[str]], + training_data: Iterable[Iterable[str]], +) -> Set[str]: + """ + Find words that do not work well with the `tokenize` function + for the provided `training_data`. + + :param Callable[[str], List[str]] tokenize: a tokenize function + :param Iterable[Iterable[str]] training_data: tokenized text, to be used\ + as a training set + :return: words that are considered to make `tokenize` perform badly + :rtype: Set[str] + """ + right = Counter() + wrong = Counter() + + for train_words in training_data: + train_set = set(index_pairs(train_words)) + test_words = tokenize("".join(train_words)) + test_pairs = index_pairs(test_words) + for w, p in zip(test_words, test_pairs): + if p in train_set: + right[w] += 1 + else: + wrong[w] += 1 + + # if wrong is more than right, then it's a bad word + bad_words = [] + for w, count in wrong.items(): + if count > right[w]: + bad_words.append(w) + + return set(bad_words)
+ + + +
+[docs]
+def revise_wordset(
+    tokenize: Callable[[str], List[str]],
+    orig_words: Iterable[str],
+    training_data: Iterable[Iterable[str]],
+) -> Set[str]:
+    """
+    Revise a set of words that could improve tokenization performance of
+    a dictionary-based `tokenize` function.
+
+    `orig_words` will be used as a base set for the dictionary.
+    Words that do not perform well with `training_data` will be removed.
+    The remaining words will be returned.
+
+    :param Callable[[str], List[str]] tokenize: a tokenize function, can be\
+        any function that takes a string as input and returns a List[str]
+    :param Iterable[str] orig_words: words that are used by the tokenize function,\
+        will be used as a base for revision
+    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
+        as a training set
+    :return: the revised set of words, with poorly performing words removed
+    :rtype: Set[str]
+
+    :Example:
+    ::
+
+        from pythainlp.corpus import thai_words
+        from pythainlp.corpus.util import revise_wordset
+        from pythainlp.tokenize.longest import segment
+        from pythainlp.util import Trie
+
+        base_words = thai_words()
+        more_words = {
+            "ถวิล อุดล", "ทองอินทร์ ภูริพัฒน์", "เตียง ศิริขันธ์", "จำลอง ดาวเรือง"
+        }
+        base_words = base_words.union(more_words)
+        dict_trie = Trie(base_words)
+
+        tokenize = lambda text: segment(text, dict_trie)
+
+        training_data = [
+            [str, str, str, ...],
+            [str, str, str, str, ...],
+            ...
+        ]
+
+        revised_words = revise_wordset(tokenize, base_words, training_data)
+    """
+    bad_words = find_badwords(tokenize, training_data)
+    return set(orig_words) - bad_words
+ + + +
+[docs]
+def revise_newmm_default_wordset(
+    training_data: Iterable[Iterable[str]],
+) -> Set[str]:
+    """
+    Revise a set of words that could improve tokenization performance of
+    `pythainlp.tokenize.newmm`, a dictionary-based tokenizer and a default
+    tokenizer for PyThaiNLP.
+
+    Words from `pythainlp.corpus.thai_words()` will be used as a base set
+    for the dictionary. Words that do not perform well with `training_data`
+    will be removed. The remaining words will be returned.
+
+    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
+        as a training set
+    :return: the revised set of words, with poorly performing words removed
+    :rtype: Set[str]
+    """
+    orig_words = thai_words()
+    trie = Trie(orig_words)
+
+    def tokenize(text):
+        return newmm.segment(text, custom_dict=trie)
+
+    revised_words = revise_wordset(tokenize, orig_words, training_data)
+    return revised_words
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/corpus/wordnet.html b/5.1/_modules/pythainlp/corpus/wordnet.html new file mode 100644 index 0000000..3eb1d63 --- /dev/null +++ b/5.1/_modules/pythainlp/corpus/wordnet.html @@ -0,0 +1,620 @@ + + + + + + + + pythainlp.corpus.wordnet — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.corpus.wordnet

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+NLTK WordNet wrapper
+
+API here is exactly the same as NLTK WordNet API,
+except that the lang (language) argument is "tha" (Thai) by default.
+
+For more on usage, see NLTK Howto:
+https://www.nltk.org/howto/wordnet.html
+"""
+import nltk
+
+try:
+    nltk.data.find("corpora/omw")
+except LookupError:
+    nltk.download("omw")
+
+try:
+    nltk.data.find("corpora/wordnet")
+except LookupError:
+    nltk.download("wordnet")
+
+from nltk.corpus import wordnet
+
+
+
+[docs] +def synsets(word: str, pos: str = None, lang: str = "tha"): + """ + This function returns the synonym set for all lemmas of the given word + with an optional argument to constrain the part of speech of the word. + + :param str word: word to find synsets of + :param str pos: constraint of the part of speech (i.e. *n* for Noun, *v* + for Verb, *a* for Adjective, *s* for Adjective + satellites, and *r* for Adverb) + :param str lang: abbreviation of language (i.e. *eng*, *tha*). + By default, it is *tha* + + :return: :class:`Synset` all lemmas of the word constrained with + the argument *pos*. + :rtype: list[:class:`Synset`] + + :Example: + + >>> from pythainlp.corpus.wordnet import synsets + >>> + >>> synsets("ทำงาน") + [Synset('function.v.01'), Synset('work.v.02'), + Synset('work.v.01'), Synset('work.v.08')] + >>> + >>> synsets("บ้าน", lang="tha")) + [Synset('duplex_house.n.01'), Synset('dwelling.n.01'), + Synset('house.n.01'), Synset('family.n.01'), Synset('home.n.03'), + Synset('base.n.14'), Synset('home.n.01'), + Synset('houseful.n.01'), Synset('home.n.07')] + + When specifying the constraint of the part of speech. For example, + the word "แรง" could be interpreted as force (n.) or hard (adj.). + + >>> from pythainlp.corpus.wordnet import synsets + >>> # By default, allow all parts of speech + >>> synsets("แรง", lang="tha") + >>> + >>> # only Noun + >>> synsets("แรง", pos="n", lang="tha") + [Synset('force.n.03'), Synset('force.n.02')] + >>> + >>> # only Adjective + >>> synsets("แรง", pos="a", lang="tha") + [Synset('hard.s.10'), Synset('strong.s.02')] + """ + return wordnet.synsets(lemma=word, pos=pos, lang=lang)
+ + + +
+[docs] +def synset(name_synsets): + """ + This function returns the synonym set (synset) given the name of the synset + (i.e. 'dog.n.01', 'chase.v.01'). + + :param str name_synsets: name of the synset + + :return: :class:`Synset` of the given name + :rtype: :class:`Synset` + + :Example: + + >>> from pythainlp.corpus.wordnet import synset + >>> + >>> difficult = synset('difficult.a.01') + >>> difficult + Synset('difficult.a.01') + >>> + >>> difficult.definition() + 'not easy; requiring great physical or mental effort to accomplish + or comprehend or endure' + """ + return wordnet.synset(name_synsets)
+ + + +
+[docs] +def all_lemma_names(pos: str = None, lang: str = "tha"): + """ + This function returns all lemma names for all synsets of the given + part of speech tag and language. If part of speech tag is not + specified, all synsets of all parts of speech will be used. + + :param str pos: constraint of the part of speech (i.e. *n* for Noun, + *v* for Verb, *a* for Adjective, *s* for + Adjective satellites, and *r* for Adverb). + By default, *pos* is **None**. + :param str lang: abbreviation of language (i.e. *eng*, *tha*). + By default, it is *tha*. + + :return: :class:`Synset` of lemmas names given the POS and language + :rtype: list[:class:`Synset`] + + :Example: + + >>> from pythainlp.corpus.wordnet import all_lemma_names + >>> + >>> all_lemma_names() + ['อเมริโก_เวสปุชชี', + 'เมืองชีย์เอนเน', + 'การรับเลี้ยงบุตรบุญธรรม', + 'ผู้กัด', + 'ตกแต่งเรือด้วยธง', + 'จิโอวานนิ_เวอร์จินิโอ',...] + >>> + >>> len(all_lemma_names()) + 80508 + >>> + >>> all_lemma_names(pos="a") + ['ซึ่งไม่มีแอลกอฮอล์', + 'ซึ่งตรงไปตรงมา', + 'ที่เส้นศูนย์สูตร', + 'ทางจิตใจ',...] + >>> + >>> len(all_lemma_names(pos="a")) + 5277 + """ + return wordnet.all_lemma_names(pos=pos, lang=lang)
+ + + +
+[docs] +def all_synsets(pos: str = None): + """ + This function iterates over all synsets constrained by the given + part of speech tag. + + :param str pos: part of speech tag + + :return: list of synsets constrained by the given part of speech tag. + :rtype: Iterable[:class:`Synset`] + + :Example: + + >>> from pythainlp.corpus.wordnet import all_synsets + >>> + >>> generator = all_synsets(pos="n") + >>> next(generator) + Synset('entity.n.01') + >>> next(generator) + Synset('physical_entity.n.01') + >>> next(generator) + Synset('abstraction.n.06') + >>> + >>> generator = all_synsets() + >>> next(generator) + Synset('able.a.01') + >>> next(generator) + Synset('unable.a.01') + """ + return wordnet.all_synsets(pos=pos)
+ + + +
+[docs] +def langs(): + """ + This function returns a set of ISO-639 language codes. + + :return: ISO-639 language codes + :rtype: list[str] + + :Example: + >>> from pythainlp.corpus.wordnet import langs + >>> langs() + ['eng', 'als', 'arb', 'bul', 'cat', 'cmn', 'dan', + 'ell', 'eus', 'fas', 'fin', 'fra', 'glg', 'heb', + 'hrv', 'ind', 'ita', 'jpn', 'nld', 'nno', 'nob', + 'pol', 'por', 'qcn', 'slv', 'spa', 'swe', 'tha', + 'zsm'] + """ + return wordnet.langs()
+ + + +
+[docs] +def lemmas(word: str, pos: str = None, lang: str = "tha"): + """ + This function returns all lemmas given the word with an optional + argument to constrain the part of speech of the word. + + :param str word: word to find lemmas of + :param str pos: constraint of the part of speech (i.e. *n* for Noun, + *v* for Verb, *a* for Adjective, *s* for + Adjective satellites, and *r* for Adverb) + :param str lang: abbreviation of language (i.e. *eng*, *tha*). + By default, it is *tha*. + + :return: :class:`Synset` of all lemmas of the word constrained + by the argument *pos*. + :rtype: list[:class:`Lemma`] + + :Example: + + >>> from pythainlp.corpus.wordnet import lemmas + >>> + >>> lemmas("โปรด") + [Lemma('like.v.03.โปรด'), Lemma('like.v.02.โปรด')] + + >>> print(lemmas("พระเจ้า")) + [Lemma('god.n.01.พระเจ้า'), Lemma('godhead.n.01.พระเจ้า'), + Lemma('father.n.06.พระเจ้า'), Lemma('god.n.03.พระเจ้า')] + + When the part of speech tag is specified: + + >>> from pythainlp.corpus.wordnet import lemmas + >>> + >>> lemmas("ม้วน") + [Lemma('roll.v.18.ม้วน'), Lemma('roll.v.17.ม้วน'), + Lemma('roll.v.08.ม้วน'), Lemma('curl.v.01.ม้วน'), + Lemma('roll_up.v.01.ม้วน'), Lemma('wind.v.03.ม้วน'), + Lemma('roll.n.11.ม้วน')] + >>> + >>> # only lemmas with Noun as the part of speech + >>> lemmas("ม้วน", pos="n") + [Lemma('roll.n.11.ม้วน')] + """ + return wordnet.lemmas(word, pos=pos, lang=lang)
+ + + +
+[docs] +def lemma(name_synsets): + """ + This function returns lemma object given the name. + + .. note:: + Support only English language (*eng*). + + :param str name_synsets: name of the synset + + :return: lemma object with the given name + :rtype: :class:`Lemma` + + :Example: + + >>> from pythainlp.corpus.wordnet import lemma + >>> + >>> lemma('practice.v.01.exercise') + Lemma('practice.v.01.exercise') + >>> + >>> lemma('drill.v.03.exercise') + Lemma('drill.v.03.exercise') + >>> + >>> lemma('exercise.n.01.exercise') + Lemma('exercise.n.01.exercise') + """ + return wordnet.lemma(name_synsets)
+ + + +
+[docs] +def lemma_from_key(key): + """ + This function returns lemma object given the lemma key. + This is similar to :func:`lemma` but it needs to be given the key + of lemma instead of the name of lemma. + + .. note:: + Support only English language (*eng*). + + :param str key: key of the lemma object + + :return: lemma object with the given key + :rtype: :class:`Lemma` + + :Example: + + >>> from pythainlp.corpus.wordnet import lemma, lemma_from_key + >>> + >>> practice = lemma('practice.v.01.exercise') + >>> practice.key() + exercise%2:41:00:: + >>> lemma_from_key(practice.key()) + Lemma('practice.v.01.exercise') + """ + return wordnet.lemma_from_key(key)
+ + + +
+[docs] +def path_similarity(synsets1, synsets2): + """ + This function returns similarity between two synsets based on the + shortest path distance calculated using the equation below. + + .. math:: + + path\\_similarity = {1 \\over shortest\\_path\\_distance(synsets1, + synsets2) + 1} + + The shortest path distance is calculated by the connection through + the is-a (hypernym/hyponym) taxonomy. The score is in the range of + 0 to 1. Path similarity of 1 indicates identicality. + + :param `Synset` synsets1: first synset supplied to measures + the path similarity with + :param `Synset` synsets2: second synset supplied to measures + the path similarity with + + :return: path similarity between two synsets + :rtype: float + + :Example: + + >>> from pythainlp.corpus.wordnet import path_similarity, synset + >>> + >>> entity = synset('entity.n.01') + >>> obj = synset('object.n.01') + >>> cat = synset('cat.n.01') + >>> + >>> path_similarity(entity, obj) + 0.3333333333333333 + >>> path_similarity(entity, cat) + 0.07142857142857142 + >>> path_similarity(obj, cat) + 0.08333333333333333 + """ + return wordnet.path_similarity(synsets1, synsets2)
+ + + +
+[docs]
+def lch_similarity(synsets1, synsets2):
+    """
+    This function returns the Leacock-Chodorow similarity (LCH)
+    between two synsets, based on the shortest path distance
+    and the maximum depth of the taxonomy. The equation to
+    calculate LCH similarity is shown below:
+
+    .. math::
+
+        lch\\_similarity = -log({shortest\\_path\\_distance(synsets1,
+        synsets2) \\over 2 * taxonomy\\_depth})
+
+    :param `Synset` synsets1: first synset used to measure
+                              the LCH similarity
+    :param `Synset` synsets2: second synset used to measure
+                              the LCH similarity
+
+    :return: LCH similarity between two synsets
+    :rtype: float
+
+    :Example:
+
+        >>> from pythainlp.corpus.wordnet import lch_similarity, synset
+        >>>
+        >>> entity = synset('entity.n.01')
+        >>> obj = synset('object.n.01')
+        >>> cat = synset('cat.n.01')
+        >>>
+        >>> lch_similarity(entity, obj)
+        2.538973871058276
+        >>> lch_similarity(entity, cat)
+        0.9985288301111273
+        >>> lch_similarity(obj, cat)
+        1.1526795099383855
+    """
+    return wordnet.lch_similarity(synsets1, synsets2)
+ + + +
+[docs] +def wup_similarity(synsets1, synsets2): + """ + This function returns Wu-Palmer similarity (WUP) between two synsets, + based on the depth of the two senses in the taxonomy and their + Least Common Subsumer (most specific ancestor node). + + :param `Synset` synsets1: first synset supplied to measures + the WUP similarity with + :param `Synset` synsets2: second synset supplied to measures + the WUP similarity with + + :return: WUP similarity between two synsets + :rtype: float + + :Example: + + >>> from pythainlp.corpus.wordnet import wup_similarity, synset + >>> + >>> entity = synset('entity.n.01') + >>> obj = synset('object.n.01') + >>> cat = synset('cat.n.01') + >>> + >>> wup_similarity(entity, obj) + 0.5 + >>> wup_similarity(entity, cat) + 0.13333333333333333 + >>> wup_similarity(obj, cat) + 0.35294117647058826 + """ + return wordnet.wup_similarity(synsets1, synsets2)
+ + + +
+[docs]
+def morphy(form, pos: str = None):
+    """
+    This function finds a possible base form for the given form,
+    with the given part of speech.
+
+    :param str form: the form to find the base form of
+    :param str pos: part of speech tag of words to be searched
+
+    :return: base form of the given form
+    :rtype: str
+
+    :Example:
+
+        >>> from pythainlp.corpus.wordnet import morphy
+        >>>
+        >>> morphy("dogs")
+        'dog'
+        >>>
+        >>> morphy("thieves")
+        'thief'
+        >>>
+        >>> morphy("mixed")
+        'mix'
+        >>>
+        >>> morphy("calculated")
+        'calculate'
+    """
+    return wordnet.morphy(form, pos=pos)
+ + + +
+[docs] +def custom_lemmas(tab_file, lang: str): + """ + This function reads a custom tab file + (see: http://compling.hss.ntu.edu.sg/omw/) + containing mappings of lemmas in the given language. + + :param tab_file: Tab file as a file or file-like object + :param str lang: abbreviation of language (i.e. *eng*, *tha*). + """ + return wordnet.custom_lemmas(tab_file, lang)
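custom_lemmas has no usage example above; the sketch below shows one way it might be called, assuming a local tab file in the OMW format. The file name is hypothetical, and whether newly added lemmas merge cleanly into the already-loaded *tha* data depends on the underlying NLTK behavior.

::

    from pythainlp.corpus.wordnet import all_lemma_names, custom_lemmas

    # hypothetical file name; the file must follow the OMW tab format
    with open("wn-data-tha-extra.tab", encoding="utf-8") as tab_file:
        custom_lemmas(tab_file, lang="tha")

    # lemmas loaded this way become visible to the other lookup functions
    print(len(all_lemma_names(lang="tha")))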
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/el/core.html b/5.1/_modules/pythainlp/el/core.html new file mode 100644 index 0000000..330a2d2 --- /dev/null +++ b/5.1/_modules/pythainlp/el/core.html @@ -0,0 +1,197 @@ + + + + + + + + pythainlp.el.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.el.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from typing import List, Union
+
+
+
+[docs] +class EntityLinker: +
+[docs] + def __init__(self, model_name:str="bela", device:str="cuda", tag:str="wikidata"): + """ + EntityLinker + + :param str model_name: model name (bela) + :param str device: device for running model on + :param str tag: Entity linking tag (wikidata) + + You can read about bela model at `https://github.com/PyThaiNLP/MultiEL \ + <https://github.com/PyThaiNLP/MultiEL>`_. + """ + self.model_name = model_name + self.device = device + self.tag = tag + if self.model_name not in ["bela"]: + raise NotImplementedError(f"EntityLinker doesn't support {model_name} model.") + if self.tag not in ["wikidata"]: + raise NotImplementedError(f"EntityLinker doesn't support {tag} tag.") + from pythainlp.el._multiel import MultiEL + self.model = MultiEL(model_name=self.model_name, device=self.device)
+ +
+[docs]
+    def get_el(self, list_text: Union[List[str], str]) -> Union[List[dict], str]:
+        """
+        Get entity linking results for Thai text.
+
+        :param Union[List[str], str] list_text: Thai text or list of Thai texts
+        :return: list of entity linking results
+        :rtype: Union[List[dict], str]
+
+        :Example:
+        ::
+
+            from pythainlp.el import EntityLinker
+
+            el = EntityLinker(device="cuda")
+            print(el.get_el("จ๊อบเคยเป็นซีอีโอบริษัทแอปเปิล"))
+            # output: [{'offsets': [11, 23],
+            #  'lengths': [6, 7],
+            #  'entities': ['Q484876', 'Q312'],
+            #  'md_scores': [0.30301809310913086, 0.6399497389793396],
+            #  'el_scores': [0.7142490744590759, 0.8657019734382629]}]
+        """
+        return self.model.process_batch(list_text)
+
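Since list_text is typed Union[List[str], str], get_el also accepts a batch of texts. The following is a hedged sketch of batch usage; the MultiEL dependency and model weights must already be installed, and the output depends on the downloaded model.

::

    from pythainlp.el import EntityLinker

    el = EntityLinker(model_name="bela", device="cuda", tag="wikidata")
    results = el.get_el([
        "จ๊อบเคยเป็นซีอีโอบริษัทแอปเปิล",
        "กรุงเทพมหานครเป็นเมืองหลวงของประเทศไทย",
    ])
    for r in results:
        print(r)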
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/generate/core.html b/5.1/_modules/pythainlp/generate/core.html new file mode 100644 index 0000000..ef42dbb --- /dev/null +++ b/5.1/_modules/pythainlp/generate/core.html @@ -0,0 +1,466 @@ + + + + + + + + pythainlp.generate.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.generate.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Text generator using n-gram language model
+
+codes are from
+https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
+"""
+
+import random
+from typing import List, Union
+
+from pythainlp.corpus.oscar import (
+    unigram_word_freqs as oscar_word_freqs_unigram,
+)
+from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram
+from pythainlp.corpus.tnc import trigram_word_freqs as tnc_word_freqs_trigram
+from pythainlp.corpus.tnc import unigram_word_freqs as tnc_word_freqs_unigram
+from pythainlp.corpus.ttc import unigram_word_freqs as ttc_word_freqs_unigram
+
+
+
+[docs] +class Unigram: + """ + Text generator using Unigram + + :param str name: corpus name + * *tnc* - Thai National Corpus (default) + * *ttc* - Thai Textbook Corpus (TTC) + * *oscar* - OSCAR Corpus + """ + +
+[docs] + def __init__(self, name: str = "tnc"): + if name == "tnc": + self.counts = tnc_word_freqs_unigram() + elif name == "ttc": + self.counts = ttc_word_freqs_unigram() + elif name == "oscar": + self.counts = oscar_word_freqs_unigram() + self.word = list(self.counts.keys()) + self.n = 0 + for i in self.word: + self.n += self.counts[i] + self.prob = {i: self.counts[i] / self.n for i in self.word} + self._word_prob: dict = {}
+ + +
+[docs] + def gen_sentence( + self, + start_seq: str = "", + N: int = 3, + prob: float = 0.001, + output_str: bool = True, + duplicate: bool = False, + ) -> Union[List[str], str]: + """ + :param str start_seq: word to begin sentence with + :param int N: number of words + :param bool output_str: output as string + :param bool duplicate: allow duplicate words in sentence + + :return: list of words or a word string + :rtype: List[str], str + + :Example: + :: + + from pythainlp.generate import Unigram + + gen = Unigram() + + gen.gen_sentence("แมว") + # output: 'แมวเวลานะนั้น' + """ + if not start_seq: + start_seq = random.choice(self.word) + rand_text = start_seq.lower() + self._word_prob = { + i: self.counts[i] / self.n + for i in self.word + if self.counts[i] / self.n >= prob + } + return self._next_word( + rand_text, N, output_str, prob=prob, duplicate=duplicate + )
+ + + def _next_word( + self, + text: str, + N: int, + output_str: bool, + prob: float, + duplicate: bool = False, + ): + words = [] + words.append(text) + word_list = list(self._word_prob.keys()) + if N > len(word_list): + N = len(word_list) + for _ in range(N): + w = random.choice(word_list) + if duplicate is False: + while w in words: + w = random.choice(word_list) + words.append(w) + + if output_str: + return "".join(words) + return words
+ + + +
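The gen_sentence parameters prob and duplicate are documented above but not demonstrated. A small sketch follows; outputs are random, and the corpus is downloaded on first use.

::

    from pythainlp.generate import Unigram

    gen = Unigram("tnc")  # or "ttc", "oscar"

    # keep only words whose unigram probability is at least 0.0005,
    # allow up to 10 words, return a list instead of a joined string
    words = gen.gen_sentence(
        "แมว", N=10, prob=0.0005, output_str=False, duplicate=False
    )
    print(words)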
+[docs] +class Bigram: + """ + Text generator using Bigram + + :param str name: corpus name + * *tnc* - Thai National Corpus (default) + """ + +
+[docs] + def __init__(self, name: str = "tnc"): + if name == "tnc": + self.uni = tnc_word_freqs_unigram() + self.bi = tnc_word_freqs_bigram() + self.uni_keys = list(self.uni.keys()) + self.bi_keys = list(self.bi.keys()) + self.words = [i[-1] for i in self.bi_keys]
+ + +
+[docs]
+    def prob(self, t1: str, t2: str) -> float:
+        """
+        Probability of the bigram: how likely *t2* follows *t1*,
+        estimated as count(t1, t2) / count(t1).
+
+        :param str t1: first word
+        :param str t2: second word
+
+        :return: probability value
+        :rtype: float
+        """
+        try:
+            v = self.bi[(t1, t2)] / self.uni[t1]
+        except ZeroDivisionError:
+            v = 0.0
+        return v
+ + +
+[docs] + def gen_sentence( + self, + start_seq: str = "", + N: int = 4, + prob: float = 0.001, + output_str: bool = True, + duplicate: bool = False, + ) -> Union[List[str], str]: + """ + :param str start_seq: word to begin sentence with + :param int N: number of words + :param bool output_str: output as string + :param bool duplicate: allow duplicate words in sentence + + :return: list of words or a word string + :rtype: List[str], str + + :Example: + :: + + from pythainlp.generate import Bigram + + gen = Bigram() + + gen.gen_sentence("แมว") + # output: 'แมวไม่ได้รับเชื้อมัน' + """ + if not start_seq: + start_seq = random.choice(self.words) + late_word = start_seq + list_word = [] + list_word.append(start_seq) + + for _ in range(N): + if duplicate: + temp = [j for j in self.bi_keys if j[0] == late_word] + else: + temp = [ + j + for j in self.bi_keys + if j[0] == late_word and j[1] not in list_word + ] + probs = [self.prob(late_word, next_word[-1]) for next_word in temp] + p2 = [j for j in probs if j >= prob] + if len(p2) == 0: + break + items = temp[probs.index(random.choice(p2))] + late_word = items[-1] + list_word.append(late_word) + + if output_str: + return "".join(list_word) + + return list_word
+
+ + + +
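To make the estimate in Bigram.prob concrete, here is a small self-contained illustration of the same count ratio, P(t2 | t1) = count(t1, t2) / count(t1). The toy counts below are made up for illustration; the real counts come from the TNC corpus.

::

    # toy counts, for illustration only
    uni = {"แมว": 4, "กิน": 3}
    bi = {("แมว", "กิน"): 2, ("กิน", "ปลา"): 3}

    def bigram_prob(t1: str, t2: str) -> float:
        # same ratio as Bigram.prob: count(t1, t2) / count(t1)
        return bi.get((t1, t2), 0) / uni[t1]

    print(bigram_prob("แมว", "กิน"))  # 0.5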
+[docs] +class Trigram: + """ + Text generator using Trigram + + :param str name: corpus name + * *tnc* - Thai National Corpus (default) + """ + +
+[docs] + def __init__(self, name: str = "tnc"): + if name == "tnc": + self.uni = tnc_word_freqs_unigram() + self.bi = tnc_word_freqs_bigram() + self.ti = tnc_word_freqs_trigram() + self.uni_keys = list(self.uni.keys()) + self.bi_keys = list(self.bi.keys()) + self.ti_keys = list(self.ti.keys()) + self.words = [i[-1] for i in self.bi_keys]
+ + +
+[docs]
+    def prob(self, t1: str, t2: str, t3: str) -> float:
+        """
+        Probability of the trigram: how likely *t3* follows the
+        bigram (*t1*, *t2*), estimated as
+        count(t1, t2, t3) / count(t1, t2).
+
+        :param str t1: first word
+        :param str t2: second word
+        :param str t3: third word
+
+        :return: probability value
+        :rtype: float
+        """
+        try:
+            v = self.ti[(t1, t2, t3)] / self.bi[(t1, t2)]
+        except ZeroDivisionError:
+            v = 0.0
+
+        return v
+ + +
+[docs] + def gen_sentence( + self, + start_seq: str = "", + N: int = 4, + prob: float = 0.001, + output_str: bool = True, + duplicate: bool = False, + ) -> Union[List[str], str]: + """ + :param str start_seq: word to begin sentence with + :param int N: number of words + :param bool output_str: output as string + :param bool duplicate: allow duplicate words in sentence + + :return: list of words or a word string + :rtype: List[str], str + + :Example: + :: + + from pythainlp.generate import Trigram + + gen = Trigram() + + gen.gen_sentence() + # output: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ' + """ + if not start_seq: + start_seq = random.choice(self.bi_keys) + late_word = start_seq + list_word = [] + list_word.append(start_seq) + + for i in range(N): + if duplicate: + temp = [j for j in self.ti_keys if j[:2] == late_word] + else: + temp = [ + j + for j in self.ti_keys + if j[:2] == late_word and j[1:] not in list_word + ] + probs = [self.prob(word[0], word[1], word[2]) for word in temp] + p2 = [j for j in probs if j >= prob] + if len(p2) == 0: + break + items = temp[probs.index(random.choice(p2))] + late_word = items[1:] + list_word.append(late_word) + + listdata = [] + for i in list_word: + for j in i: + if j not in listdata: + listdata.append(j) + + if output_str: + return "".join(listdata) + + return listdata
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/generate/wangchanglm.html b/5.1/_modules/pythainlp/generate/wangchanglm.html new file mode 100644 index 0000000..6cc296f --- /dev/null +++ b/5.1/_modules/pythainlp/generate/wangchanglm.html @@ -0,0 +1,343 @@ + + + + + + + + pythainlp.generate.wangchanglm — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for pythainlp.generate.wangchanglm

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+import re
+
+import torch
+
+
+
+[docs] +class WangChanGLM: +
+[docs] + def __init__(self): + self.exclude_pattern = re.compile(r'[^ก-๙]+') + self.stop_token = "\n" + self.PROMPT_DICT = { + "prompt_input": ( + "<context>: {input}\n<human>: {instruction}\n<bot>: " + ), + "prompt_no_input": ( + "<human>: {instruction}\n<bot>: " + ), + "prompt_chatbot": ( + "<human>: {human}\n<bot>: {bot}" + ), + }
+ +
+[docs] + def is_exclude(self, text:str)->bool: + return bool(self.exclude_pattern.search(text))
+ +
+[docs]
+    def load_model(
+        self,
+        model_path: str = "pythainlp/wangchanglm-7.5B-sft-en-sharded",
+        return_dict: bool = True,
+        load_in_8bit: bool = False,
+        device: str = "cuda",
+        torch_dtype=torch.float16,
+        offload_folder: str = "./",
+        low_cpu_mem_usage: bool = True
+    ):
+        """
+        Load model
+
+        :param str model_path: model path
+        :param bool return_dict: return dict
+        :param bool load_in_8bit: load model in 8 bit
+        :param str device: device (cpu, cuda or other)
+        :param torch_dtype torch_dtype: torch_dtype
+        :param str offload_folder: offload folder
+        :param bool low_cpu_mem_usage: low cpu mem usage
+        """
+        import pandas as pd
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        self.device = device
+        self.torch_dtype = torch_dtype
+        self.model_path = model_path
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_path,
+            return_dict=return_dict,
+            load_in_8bit=load_in_8bit,
+            device_map=device,
+            torch_dtype=torch_dtype,
+            offload_folder=offload_folder,
+            low_cpu_mem_usage=low_cpu_mem_usage
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+        self.df = pd.DataFrame(self.tokenizer.vocab.items(), columns=['text', 'idx'])
+        self.df['is_exclude'] = self.df.text.map(self.is_exclude)
+        # token ids of non-Thai vocabulary entries, suppressed when thai_only=True
+        self.exclude_ids = self.df[self.df.is_exclude].idx.tolist()
+ +
+[docs] + def gen_instruct( + self, + text:str, + max_new_tokens:int=512, + top_p:float=0.95, + temperature:float=0.9, + top_k:int=50, + no_repeat_ngram_size:int=2, + typical_p:float=1., + thai_only:bool=True, + skip_special_tokens:bool=True + ): + """ + Generate Instruct + + :param str text: text + :param int max_new_tokens: maximum number of new tokens + :param float top_p: top p + :param float temperature: temperature + :param int top_k: top k + :param int no_repeat_ngram_size: do not repeat ngram size + :param float typical_p: typical p + :param bool thai_only: Thai only + :param bool skip_special_tokens: skip special tokens + :return: the answer from Instruct + :rtype: str + """ + batch = self.tokenizer(text, return_tensors="pt") + with torch.autocast(device_type=self.device, dtype=self.torch_dtype): + if thai_only: + output_tokens = self.model.generate( + input_ids=batch["input_ids"], + max_new_tokens=max_new_tokens, # 512 + begin_suppress_tokens = self.exclude_ids, + no_repeat_ngram_size=no_repeat_ngram_size, + #oasst k50 + top_k=top_k, + top_p=top_p, # 0.95 + typical_p=typical_p, + temperature=temperature, # 0.9 + ) + else: + output_tokens = self.model.generate( + input_ids=batch["input_ids"], + max_new_tokens=max_new_tokens, # 512 + no_repeat_ngram_size=no_repeat_ngram_size, + #oasst k50 + top_k=top_k, + top_p=top_p, # 0.95 + typical_p=typical_p, + temperature=temperature, # 0.9 + ) + return self.tokenizer.decode(output_tokens[0][len(batch["input_ids"][0]):], skip_special_tokens=skip_special_tokens)
+ +
+[docs] + def instruct_generate( + self, + instruct: str, + context: str = None, + max_new_tokens=512, + temperature: float =0.9, + top_p: float = 0.95, + top_k:int=50, + no_repeat_ngram_size:int=2, + typical_p:float=1, + thai_only:bool=True, + skip_special_tokens:bool=True + ): + """ + Generate Instruct + + :param str instruct: Instruct + :param str context: context + :param int max_new_tokens: maximum number of new tokens + :param float top_p: top p + :param float temperature: temperature + :param int top_k: top k + :param int no_repeat_ngram_size: do not repeat ngram size + :param float typical_p: typical p + :param bool thai_only: Thai only + :param bool skip_special_tokens: skip special tokens + :return: the answer from Instruct + :rtype: str + + :Example: + :: + + from pythainlp.generate.wangchanglm import WangChanGLM + import torch + + model = WangChanGLM() + + model.load_model(device="cpu",torch_dtype=torch.bfloat16) + + print(model.instruct_generate(instruct="ขอวิธีลดน้ำหนัก")) + # output: ลดน้ําหนักให้ได้ผล ต้องทําอย่างค่อยเป็นค่อยไป + # ปรับเปลี่ยนพฤติกรรมการกินอาหาร + # ออกกําลังกายอย่างสม่ําเสมอ + # และพักผ่อนให้เพียงพอ + # ที่สําคัญควรหลีกเลี่ยงอาหารที่มีแคลอรี่สูง + # เช่น อาหารทอด อาหารมัน อาหารที่มีน้ําตาลสูง + # และเครื่องดื่มแอลกอฮอล์ + + """ + if context in (None, ""): + prompt = self.PROMPT_DICT['prompt_no_input'].format_map( + {'instruction': instruct, 'input': ''} + ) + else: + prompt = self.PROMPT_DICT['prompt_input'].format_map( + {'instruction': instruct, 'input': context} + ) + result = self.gen_instruct( + prompt, + max_new_tokens=max_new_tokens, + top_p=top_p, + top_k=top_k, + temperature=temperature, + no_repeat_ngram_size=no_repeat_ngram_size, + typical_p=typical_p, + thai_only=thai_only, + skip_special_tokens=skip_special_tokens + ) + return result
+
+ +
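The prompt templates in PROMPT_DICT determine how instruct_generate builds its model input. The sketch below only reproduces the same format_map calls on a copy of those templates, so it runs without downloading the 7.5B model.

::

    # illustrative only: builds the same prompt strings WangChanGLM uses
    PROMPT_DICT = {
        "prompt_input": "<context>: {input}\n<human>: {instruction}\n<bot>: ",
        "prompt_no_input": "<human>: {instruction}\n<bot>: ",
    }

    instruct = "ขอวิธีลดน้ำหนัก"
    context = ""

    if context in (None, ""):
        prompt = PROMPT_DICT["prompt_no_input"].format_map(
            {"instruction": instruct, "input": ""}
        )
    else:
        prompt = PROMPT_DICT["prompt_input"].format_map(
            {"instruction": instruct, "input": context}
        )

    print(prompt)
    # <human>: ขอวิธีลดน้ำหนัก
    # <bot>: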
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/khavee/core.html b/5.1/_modules/pythainlp/khavee/core.html new file mode 100644 index 0000000..a7bf45b --- /dev/null +++ b/5.1/_modules/pythainlp/khavee/core.html @@ -0,0 +1,822 @@ + + + + + + + + pythainlp.khavee.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.khavee.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+# ruff: noqa: C901
+
+from typing import List, Union
+
+from pythainlp.tokenize import subword_tokenize
+from pythainlp.util import remove_tonemark, sound_syllable
+
+
+
+[docs] +class KhaveeVerifier: +
+[docs] + def __init__(self): + """ + KhaveeVerifier: Thai Poetry verifier + """
+ + +
+[docs] + def check_sara(self, word: str) -> str: + """ + Check the vowels in the Thai word. + + :param str word: Thai word + :return: vowel name of the word + :rtype: str + + :Example: + :: + + from pythainlp.khavee import KhaveeVerifier + + kv = KhaveeVerifier() + + print(kv.check_sara("เริง")) + # output: 'เออ' + """ + sara = [] + countoa = 0 + + # In case of การันย์ + if "์" in word[-1]: + word = word[:-2] + + # In case of สระเดี่ยว + for i in word: + if i in ("ะ", "ั"): + sara.append("อะ") + elif i == "ิ": + sara.append("อิ") + elif i == "ุ": + sara.append("อุ") + elif i == "ึ": + sara.append("อึ") + elif i == "ี": + sara.append("อี") + elif i == "ู": + sara.append("อู") + elif i == "ื": + sara.append("อือ") + elif i == "เ": + sara.append("เอ") + elif i == "แ": + sara.append("แอ") + elif i == "า": + sara.append("อา") + elif i == "โ": + sara.append("โอ") + elif i == "ำ": + sara.append("อำ") + elif i == "อ": + countoa += 1 + sara.append("ออ") + elif i == "ั" and "ว" in word: + sara.append("อัว") + elif i in ("ไ", "ใ"): + sara.append("ไอ") + elif i == "็": + sara.append("ออ") + elif "รร" in word: + if self.check_marttra(word) == "กม": + sara.append("อำ") + else: + sara.append("อะ") + + # In case of ออ + if countoa == 1 and "อ" in word[-1] and "เ" not in word: + sara.remove("ออ") + + # In case of เอ เอ + countA = 0 + for i in sara: + if i == "เอ": + countA = countA + 1 + if countA > 1: + sara.remove("เอ") + sara.remove("เอ") + sara.append("แ") + + # In case of สระประสม + if "เอ" in sara and "อะ" in sara: + sara.remove("เอ") + sara.remove("อะ") + sara.append("เอะ") + elif "แอ" in sara and "อะ" in sara: + sara.remove("แอ") + sara.remove("อะ") + sara.append("แอะ") + + if "เอะ" in sara and "ออ" in sara: + sara.remove("เอะ") + sara.remove("ออ") + sara.append("เออะ") + elif "เอ" in sara and "อิ" in sara: + sara.remove("เอ") + sara.remove("อิ") + sara.append("เออ") + elif "เอ" in sara and "ออ" in sara and "อ" in word[-1]: + sara.remove("เอ") + sara.remove("ออ") + sara.append("เออ") + elif "โอ" in sara and "อะ" in sara: + sara.remove("โอ") + sara.remove("อะ") + sara.append("โอะ") + elif "เอ" in sara and "อี" in sara: + sara.remove("เอ") + sara.remove("อี") + sara.append("เอีย") + elif "เอ" in sara and "อือ" in sara: + sara.remove("เอ") + sara.remove("อือ") + sara.append("อัว") + elif "เอ" in sara and "อา" in sara: + sara.remove("เอ") + sara.remove("อา") + sara.append("เอา") + elif "เ" in word and "า" in word and "ะ" in word: + sara = [] + sara.append("เอาะ") + + if "อือ" in sara and "เออ" in sara: + sara.remove("เออ") + sara.remove("อือ") + sara.append("เอือ") + elif "ออ" in sara and len(sara) > 1: + sara.remove("ออ") + elif "ว" in word and len(sara) == 0: + sara.append("อัว") + + if "ั" in word and self.check_marttra(word) == "กา": + sara = [] + sara.append("ไอ") + + # In case of อ + if word == "เออะ": + sara = [] + sara.append("เออะ") + elif word == "เออ": + sara = [] + sara.append("เออ") + elif word == "เอ": + sara = [] + sara.append("เอ") + elif word == "เอะ": + sara = [] + sara.append("เอะ") + elif word == "เอา": + sara = [] + sara.append("เอา") + elif word == "เอาะ": + sara = [] + sara.append("เอาะ") + + if "ฤา" in word or "ฦา" in word: + sara = [] + sara.append("อือ") + elif "ฤ" in word or "ฦ" in word: + sara = [] + sara.append("อึ") + + # In case of กน + if not sara and len(word) == 2: + if word[-1] != "ร": + sara.append("โอะ") + else: + sara.append("ออ") + elif not sara and len(word) == 3: + sara.append("ออ") + + # In case of บ่ + if word == "บ่": + sara = [] + sara.append("ออ") + + if 
"ํ" in word: + sara = [] + sara.append("อำ") + + if "เ" in word and "ื" in word and "อ" in word: + sara = [] + sara.append("เอือ") + + if not sara: + return "Can't find Sara in this word" + + return sara[0]
+ + +
+[docs]
+    def check_marttra(self, word: str) -> str:
+        """
+        Check the spelling section (มาตราตัวสะกด) of a Thai word.
+
+        :param str word: Thai word
+        :return: name of the spelling section of the word
+        :rtype: str
+
+        :Example:
+        ::
+
+            from pythainlp.khavee import KhaveeVerifier
+
+            kv = KhaveeVerifier()
+
+            print(kv.check_marttra("สาว"))
+            # output: 'เกอว'
+        """
+        if word[-1] == "ร" and word[-2] in ["ต", "ท"]:
+            word = word[:-1]
+        word = self.handle_karun_sound_silence(word)
+        word = remove_tonemark(word)
+        if (
+            "ำ" in word
+            or ("ํ" in word and "า" in word)
+            or "ไ" in word
+            or "ใ" in word
+        ):
+            return "กา"
+        elif (
+            word[-1] in ["า", "ะ", "ิ", "ี", "ุ", "ู", "อ"]
+            or ("ี" in word and "ย" in word[-1])
+            or ("ื" in word and "อ" in word[-1])
+        ):
+            return "กา"
+        elif word[-1] in ["ง"]:
+            return "กง"
+        elif word[-1] in ["ม"]:
+            return "กม"
+        elif word[-1] in ["ย"]:
+            if "ั" in word:
+                return "กา"
+            else:
+                return "เกย"
+        elif word[-1] in ["ว"]:
+            return "เกอว"
+        elif word[-1] in ["ก", "ข", "ค", "ฆ"]:
+            return "กก"
+        elif word[-1] in [
+            "จ",
+            "ช",
+            "ซ",
+            "ฎ",
+            "ฏ",
+            "ฐ",
+            "ฑ",
+            "ฒ",
+            "ด",
+            "ต",
+            "ถ",
+            "ท",
+            "ธ",
+            "ศ",
+            "ษ",
+            "ส",
+        ]:
+            return "กด"
+        elif word[-1] in ["ญ", "ณ", "น", "ร", "ล", "ฬ"]:
+            return "กน"
+        elif word[-1] in ["บ", "ป", "พ", "ฟ", "ภ"]:
+            return "กบ"
+        else:
+            if "็" in word:
+                return "กา"
+            else:
+                return "Can't find Marttra in this word"
+ + +
+[docs] + def is_sumpus(self, word1: str, word2: str) -> bool: + """ + Check the rhyme between two words. + + :param str word1: Thai word + :param str word2: Thai word + :return: boolean + :rtype: bool + + :Example: + :: + + from pythainlp.khavee import KhaveeVerifier + + kv = KhaveeVerifier() + + print(kv.is_sumpus("สรร", "อัน")) + # output: True + + print(kv.is_sumpus("สรร", "แมว")) + # output: False + """ + marttra1 = self.check_marttra(word1) + marttra2 = self.check_marttra(word2) + sara1 = self.check_sara(word1) + sara2 = self.check_sara(word2) + if sara1 == "อะ" and marttra1 == "เกย": + sara1 = "ไอ" + marttra1 = "กา" + elif sara2 == "อะ" and marttra2 == "เกย": + sara2 = "ไอ" + marttra2 = "กา" + if sara1 == "อำ" and marttra1 == "กม": + sara1 = "อำ" + marttra1 = "กา" + elif sara2 == "อำ" and marttra2 == "กม": + sara2 = "อำ" + marttra2 = "กา" + return bool(marttra1 == marttra2 and sara1 == sara2)
+ + +
+[docs]
+    def check_karu_lahu(self, text):
+        """
+        Classify a Thai syllable as karu (ครุ, heavy syllable) or
+        lahu (ลหุ, light syllable).
+
+        :param str text: Thai syllable
+        :return: "karu" or "lahu"
+        :rtype: str
+        """
+        if (
+            self.check_marttra(text) != "กา"
+            or (
+                self.check_marttra(text) == "กา"
+                and self.check_sara(text)
+                in [
+                    "อา",
+                    "อี",
+                    "อือ",
+                    "อู",
+                    "เอ",
+                    "แอ",
+                    "โอ",
+                    "ออ",
+                    "เออ",
+                    "เอีย",
+                    "เอือ",
+                    "อัว",
+                ]
+            )
+            or self.check_sara(text) in ["อำ", "ไอ", "เอา"]
+        ) and text not in ["บ่", "ณ", "ธ", "ก็"]:
+            return "karu"
+        else:
+            return "lahu"
+ + +
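check_karu_lahu has no example in its docstring; the short sketch below gives expected outputs inferred from the rules above (closed syllables and long vowels are karu, short open syllables are lahu), so treat the results as indicative rather than guaranteed.

::

    from pythainlp.khavee import KhaveeVerifier

    kv = KhaveeVerifier()

    # "นา" has a long vowel (อา) -> heavy syllable
    print(kv.check_karu_lahu("นา"))  # expected: 'karu'

    # "จะ" is a short open syllable -> light syllable
    print(kv.check_karu_lahu("จะ"))  # expected: 'lahu'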
+[docs] + def check_klon(self, text: str, k_type: int = 8) -> Union[List[str], str]: + """ + Check the suitability of the poem according to Thai principles. + + :param str text: Thai poem + :param int k_type: type of Thai poem + :return: the check results of the suitability of the poem according to Thai principles. + :rtype: Union[List[str], str] + + :Example: + :: + + from pythainlp.khavee import KhaveeVerifier + + kv = KhaveeVerifier() + + print(kv.check_klon( + 'ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งคะนอง \ + มีคนจับจอง เขาชื่อน้องเธียร', + k_type=4 + )) + # output: The poem is correct according to the principle. + + print(kv.check_klon( + 'ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง \ + เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร', + k_type=4 + )) + # output: [ + "Can't find rhyme between paragraphs ('หมา', 'จอง') in paragraph 2", + "Can't find rhyme between paragraphs ('หมา', 'ทอง') in paragraph 2" + ] + """ + if k_type == 8: + try: + error = [] + list_sumpus_sent1 = [] + list_sumpus_sent2h = [] + list_sumpus_sent2l = [] + list_sumpus_sent3 = [] + list_sumpus_sent4 = [] + for i, sent in enumerate(text.split()): + sub_sent = subword_tokenize(sent, engine="dict") + if len(sub_sent) > 10: + error.append( + "In sentence " + + str(i + 2) + + ", there are more than 10 words. " + + str(sub_sent) + ) + if (i + 1) % 4 == 1: + list_sumpus_sent1.append(sub_sent[-1]) + elif (i + 1) % 4 == 2: + list_sumpus_sent2h.append( + [ + sub_sent[1], + sub_sent[2], + sub_sent[3], + sub_sent[4], + ] + ) + list_sumpus_sent2l.append(sub_sent[-1]) + elif (i + 1) % 4 == 3: + list_sumpus_sent3.append(sub_sent[-1]) + elif (i + 1) % 4 == 0: + list_sumpus_sent4.append(sub_sent[-1]) + if ( + len(list_sumpus_sent1) != len(list_sumpus_sent2h) + or len(list_sumpus_sent2h) != len(list_sumpus_sent2l) + or len(list_sumpus_sent2l) != len(list_sumpus_sent3) + or len(list_sumpus_sent3) != len(list_sumpus_sent4) + or len(list_sumpus_sent4) != len(list_sumpus_sent1) + ): + return "The poem does not have 4 complete sentences." + else: + for i in range(len(list_sumpus_sent1)): + countwrong = 0 + for j in list_sumpus_sent2h[i]: + if ( + self.is_sumpus(list_sumpus_sent1[i], j) + is False + ): + countwrong += 1 + if countwrong > 3: + error.append( + "Can't find rhyme between paragraphs " + + str( + ( + list_sumpus_sent1[i], + list_sumpus_sent2h[i], + ) + ) + + " in paragraph " + + str(i + 1) + ) + if ( + self.is_sumpus( + list_sumpus_sent2l[i], list_sumpus_sent3[i] + ) + is False + ): + error.append( + "Can't find rhyme between paragraphs " + + str( + ( + list_sumpus_sent2l[i], + list_sumpus_sent3[i], + ) + ) + + " in paragraph " + + str(i + 1) + ) + if i > 0: + if ( + self.is_sumpus( + list_sumpus_sent2l[i], + list_sumpus_sent4[i - 1], + ) + is False + ): + error.append( + "Can't find rhyme between paragraphs " + + str( + ( + list_sumpus_sent2l[i], + list_sumpus_sent4[i - 1], + ) + ) + + " in paragraph " + + str(i + 1) + ) + if not error: + return ( + "The poem is correct according to the principle." + ) + else: + return error + except: + return "Something went wrong. Make sure you enter it in the correct form of klon 8." 
+ elif k_type == 4: + try: + error = [] + list_sumpus_sent1 = [] + list_sumpus_sent2h = [] + list_sumpus_sent2l = [] + list_sumpus_sent3 = [] + list_sumpus_sent4 = [] + for i, sent in enumerate(text.split()): + sub_sent = subword_tokenize(sent, engine="dict") + if len(sub_sent) > 5: + error.append( + "In sentence " + + str(i + 2) + + ", there are more than 4 words. " + + str(sub_sent) + ) + if (i + 1) % 4 == 1: + list_sumpus_sent1.append(sub_sent[-1]) + elif (i + 1) % 4 == 2: + list_sumpus_sent2h.append([sub_sent[1], sub_sent[2]]) + list_sumpus_sent2l.append(sub_sent[-1]) + elif (i + 1) % 4 == 3: + list_sumpus_sent3.append(sub_sent[-1]) + elif (i + 1) % 4 == 0: + list_sumpus_sent4.append(sub_sent[-1]) + if ( + len(list_sumpus_sent1) != len(list_sumpus_sent2h) + or len(list_sumpus_sent2h) != len(list_sumpus_sent2l) + or len(list_sumpus_sent2l) != len(list_sumpus_sent3) + or len(list_sumpus_sent3) != len(list_sumpus_sent4) + or len(list_sumpus_sent4) != len(list_sumpus_sent1) + ): + return "The poem does not have 4 complete sentences." + else: + for i in range(len(list_sumpus_sent1)): + countwrong = 0 + for j in list_sumpus_sent2h[i]: + if ( + self.is_sumpus(list_sumpus_sent1[i], j) + is False + ): + countwrong += 1 + if countwrong > 1: + error.append( + "Can't find rhyme between paragraphs " + + str( + ( + list_sumpus_sent1[i], + list_sumpus_sent2h[i], + ) + ) + + " in paragraph " + + str(i + 1) + ) + if ( + self.is_sumpus( + list_sumpus_sent2l[i], list_sumpus_sent3[i] + ) + is False + ): + error.append( + "Can't find rhyme between paragraphs " + + str( + ( + list_sumpus_sent2l[i], + list_sumpus_sent3[i], + ) + ) + + " in paragraph " + + str(i + 1) + ) + if i > 0: + if ( + self.is_sumpus( + list_sumpus_sent2l[i], + list_sumpus_sent4[i - 1], + ) + is False + ): + error.append( + "Can't find rhyme between paragraphs " + + str( + ( + list_sumpus_sent2l[i], + list_sumpus_sent4[i - 1], + ) + ) + + " in paragraph " + + str(i + 1) + ) + if not error: + return ( + "The poem is correct according to the principle." + ) + else: + return error + except: + return "Something went wrong. Make sure you enter it in the correct form." + + else: + return "Something went wrong. Make sure you enter it in the correct form."
+ + +
+[docs] + def check_aek_too( + self, text: Union[List[str], str], dead_syllable_as_aek: bool = False + ) -> Union[List[bool], List[str], bool, str]: + """ + Checker of Thai tonal words + + :param Union[List[str], str] text: Thai word or list of Thai words + :param bool dead_syllable_as_aek: if True, dead syllable will be considered as aek + :return: the check result if the word is aek or too or False (not both) or list of check results if input is list + :rtype: Union[List[bool], List[str], bool, str] + + :Example: + :: + + from pythainlp.khavee import KhaveeVerifier + + kv = KhaveeVerifier() + + # การเช็คคำเอกโท + print( + kv.check_aek_too("เอง"), + kv.check_aek_too("เอ่ง"), + kv.check_aek_too("เอ้ง"), + ) + # -> False, aek, too + print(kv.check_aek_too(["เอง", "เอ่ง", "เอ้ง"])) # ใช้ List ได้เหมือนกัน + # -> [False, 'aek', 'too'] + + + """ + if isinstance(text, list): + return [self.check_aek_too(t, dead_syllable_as_aek) for t in text] + + if not isinstance(text, str): + raise TypeError("text must be str or iterable list[str]") + + word_characters = [*text] + if "่" in word_characters and "้" not in word_characters: + return "aek" + elif "้" in word_characters and "่" not in word_characters: + return "too" + if dead_syllable_as_aek and sound_syllable(text) == "dead": + return "aek" + else: + return False
+ + +
+[docs] + def handle_karun_sound_silence(self, word: str) -> str: + """ + Handle silent sounds in Thai words using '์' character (Karun) + by stripping all characters before the 'Karun' character that should be silenced + + :param str text: Thai word + :return: Thai word with silent words stripped + :rtype: str + """ + sound_silenced = word.endswith("์") + if not sound_silenced: + return word + thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" + locate_silenced = word.rfind("์") - 1 + can_silence_two = word[locate_silenced - 2] in thai_consonants + cut_off = 2 if can_silence_two else 1 + word = word[: locate_silenced + 1 - cut_off] + return word
+
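A brief sketch of handle_karun_sound_silence follows. The expected outputs are worked out from the stripping rule above; behavior on other words depends on the character two positions before the Thanthakhat, so this is illustrative only.

::

    from pythainlp.khavee import KhaveeVerifier

    kv = KhaveeVerifier()

    # "จันทร์" ends with Thanthakhat over a consonant cluster -> "จัน"
    print(kv.handle_karun_sound_silence("จันทร์"))  # expected: 'จัน'

    # a word without Thanthakhat is returned unchanged
    print(kv.handle_karun_sound_silence("แมว"))     # expected: 'แมว'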
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/lm/text_util.html b/5.1/_modules/pythainlp/lm/text_util.html new file mode 100644 index 0000000..1ee94a6 --- /dev/null +++ b/5.1/_modules/pythainlp/lm/text_util.html @@ -0,0 +1,219 @@ + + + + + + + + pythainlp.lm.text_util — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.lm.text_util

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+# ruff: noqa: C901
+
+from typing import List, Tuple, Dict
+
+
+
+[docs] +def calculate_ngram_counts( + list_words: List[str], + n_min: int = 2, + n_max: int = 4) -> Dict[Tuple[str], int]: + """ + Calculates the counts of n-grams in the list words for the specified range. + + :param List[str] list_words: List of string + :param int n_min: The minimum n-gram size (default: 2). + :param int n_max: The maximum n-gram size (default: 4). + + :return: A dictionary where keys are n-grams and values are their counts. + :rtype: Dict[Tuple[str], int] + """ + + ngram_counts = {} + + for n in range(n_min, n_max + 1): + for i in range(len(list_words) - n + 1): + ngram = tuple(list_words[i:i + n]) + ngram_counts[ngram] = ngram_counts.get(ngram, 0) + 1 + + return ngram_counts
+ + + +
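calculate_ngram_counts has no example above; the sketch below works out the expected dictionary by hand from the counting loop, assuming the function is re-exported from pythainlp.lm in the same way as remove_repeated_ngrams.

::

    from pythainlp.lm import calculate_ngram_counts

    counts = calculate_ngram_counts(["แมว", "กิน", "ปลา"], n_min=2, n_max=3)
    print(counts)
    # expected:
    # {('แมว', 'กิน'): 1, ('กิน', 'ปลา'): 1, ('แมว', 'กิน', 'ปลา'): 1}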
+[docs] +def remove_repeated_ngrams(string_list: List[str], n: int = 2) -> List[str]: + """ + Remove repeated n-grams + + :param List[str] string_list: List of string + :param int n: n-gram size + :return: List of string + :rtype: List[str] + + :Example: + :: + + from pythainlp.lm import remove_repeated_ngrams + + remove_repeated_ngrams(['เอา', 'เอา', 'แบบ', 'ไหน'], n=1) + # output: ['เอา', 'แบบ', 'ไหน'] + """ + if not string_list or n <= 0: + return string_list + + unique_ngrams = set() + + output_list = [] + + for i in range(len(string_list)): + if i + n <= len(string_list): + ngram = tuple(string_list[i:i + n]) + + if ngram not in unique_ngrams: + unique_ngrams.add(ngram) + + if not output_list or output_list[-(n - 1):] != list(ngram[:-1]): + output_list.extend(ngram) + else: + output_list.append(ngram[-1]) + else: + for char in string_list[i:]: + if not output_list or output_list[-1] != char: + output_list.append(char) + + return output_list
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/morpheme/thaiwordcheck.html b/5.1/_modules/pythainlp/morpheme/thaiwordcheck.html new file mode 100644 index 0000000..25f9948 --- /dev/null +++ b/5.1/_modules/pythainlp/morpheme/thaiwordcheck.html @@ -0,0 +1,272 @@ + + + + + + + + pythainlp.morpheme.thaiwordcheck — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for pythainlp.morpheme.thaiwordcheck

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Check if a word is a "native Thai word"
+
+Adapted from
+https://github.com/wannaphong/open-thai-nlp-document/blob/master/check_thai_word.md
+
+References
+- ทีมงานทรูปลูกปัญญา 2015. ลักษณะของคำไทยแท้ \
+    http://www.trueplookpanya.com/learning/detail/30589-043067
+- วารุณี บำรุงรส 2010. คำไทยแท้ https://www.gotoknow.org/posts/377619
+"""
+import re
+
+_THANTHAKHAT_CHAR = "\u0e4c"  # Thanthakhat (cancellation of sound)
+
+# Non-native Thai characters
+_TH_NON_NATIVE_CHARS = {
+    "ฆ",
+    "ณ",
+    "ฌ",
+    "ฎ",
+    "ฏ",
+    "ฐ",
+    "ฑ",
+    "ฒ",
+    "ธ",
+    "ศ",
+    "ษ",
+    "ฬ",
+    _THANTHAKHAT_CHAR,
+}
+
+# Native Thai final consonants
+_TH_NATIVE_FINALS = {"ก", "ด", "บ", "น", "ง", "ม", "ย", "ว"}
+
+# Known native Thai words (exceptions)
+_TH_NATIVE_WORDS = {
+    "ฆ่า",
+    "เฆี่ยน",
+    "ศึก",
+    "ศอก",
+    "เศิก",
+    "เศร้า",
+    "ธ",
+    "ณ",
+    "ฯพณฯ",
+    "ใหญ่",
+    "หญ้า",
+    "ควาย",
+    "ความ",
+    "กริ่งเกรง",
+    "ผลิ",
+}
+
+# Diphthong prefixes (can start native Thai word)
+_TH_PREFIX_DIPHTHONG = {"กะ", "กระ", "ปะ", "ประ"}
+
+# Thai consonant filter
+# O ANG (U+0E2D) is omitted, as it can be considered as vowel
+_TH_CONSONANTS_PATTERN = re.compile(r"[ก-ฬฮ]", re.U)
+
+
+
+[docs] +def is_native_thai(word: str) -> bool: + """ + Check if a word is an "native Thai word" (Thai: "คำไทยแท้") + This function is based on a simple heuristic algorithm + and cannot be entirely reliable. + + :param str word: word + :return: True or False + :rtype: bool + + :Example: + + English word:: + + from pythainlp.util import is_native_thai + + is_native_thai("Avocado") + # output: False + + Native Thai word:: + + is_native_thai("มะม่วง") + # output: True + is_native_thai("ตะวัน") + # output: True + + Non-native Thai word:: + + is_native_thai("สามารถ") + # output: False + is_native_thai("อิสริยาภรณ์") + # output: False + """ + if not isinstance(word, str) or not word.strip(): + return False + + word = word.strip() + + # Known native Thai words (exceptions) + if word in _TH_NATIVE_WORDS: + return True + + # If a word contains non-Thai chars, it is not a native Thai + if any(ch in word for ch in _TH_NON_NATIVE_CHARS): + return False + + # If it does not contain any Thai consonants -> it cannot be Thai + chs = re.findall(_TH_CONSONANTS_PATTERN, word) + if not chs: + return False + + # If there's only one Thai consonant -> it can be a native Thai + if len(chs) == 1: + return True + + # If a word ends with native final, it can be a native Thai + if word[-1] in _TH_NATIVE_FINALS: + return True + + # Note: This will not work, as it check the whole word, not the prefix. + # Prefix-sensitive tokenization is required in order to be able to check this. + if word in _TH_PREFIX_DIPHTHONG: + return True + + return False
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/morpheme/word_formation.html b/5.1/_modules/pythainlp/morpheme/word_formation.html new file mode 100644 index 0000000..d263d59 --- /dev/null +++ b/5.1/_modules/pythainlp/morpheme/word_formation.html @@ -0,0 +1,200 @@ + + + + + + + + pythainlp.morpheme.word_formation — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for pythainlp.morpheme.word_formation

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from pythainlp import thai_consonants
+
+
+
+[docs]
+def nighit(w1: str, w2: str) -> str:
+    """
+    Nighit (นิคหิต or ํ ) is the niggahita mark in Thai, used to form \
+    new Thai words from Pali roots.
+    This function uses a simple rule-based method to create a new Thai \
+    word from two words whose roots come from Pali.
+
+    Read more: https://www.trueplookpanya.com/learning/detail/1180
+
+    :param str w1: a Thai word that carries a nighit
+    :param str w2: a Thai word
+    :return: the combined Thai word
+    :rtype: str
+    :Example:
+
+    ::
+
+        from pythainlp.morpheme import nighit
+
+        assert nighit("สํ","คีต")=="สังคีต"
+        assert nighit("สํ","จร")=="สัญจร"
+        assert nighit("สํ","ฐาน")=="สัณฐาน"
+        assert nighit("สํ","นิษฐาน")=="สันนิษฐาน"
+        assert nighit("สํ","ปทา")=="สัมปทา"
+        assert nighit("สํ","โยค")=="สังโยค"
+    """
+    if not str(w1).endswith('ํ') and len(w1) != 2:
+        raise NotImplementedError(f"The function doesn't support {w1}.")
+    list_w1 = list(w1)
+    list_w2 = list(w2)
+    newword = list()
+    newword.append(list_w1[0])
+    newword.append("ั")
+    consonant_start = [i for i in list_w2 if i in set(thai_consonants)][0]
+    if consonant_start in ["ก", "ช", "ค", "ข", "ง"]:
+        newword.append("ง")
+    elif consonant_start in ["จ", "ฉ", "ช", "ฌ"]:
+        newword.append("ญ")
+    elif consonant_start in ["ฎ", "ฐ", "ฑ", "ณ"]:
+        newword.append("ณ")
+    elif consonant_start in ["ด", "ถ", "ท", "ธ", "น"]:
+        newword.append("น")
+    elif consonant_start in ["ป", "ผ", "พ", "ภ"]:
+        newword.append("ม")
+    elif consonant_start in ["ย", "ร", "ล", "ฬ", "ว", "ศ", "ษ", "ส", "ห"]:
+        newword.append("ง")
+    else:
+        raise NotImplementedError(f"""
+        The function doesn't support {w1} and {w2}.
+        """)
+    newword.extend(list_w2)
+    return ''.join(newword)
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/parse/core.html b/5.1/_modules/pythainlp/parse/core.html new file mode 100644 index 0000000..a36594c --- /dev/null +++ b/5.1/_modules/pythainlp/parse/core.html @@ -0,0 +1,263 @@ + + + + + + + + pythainlp.parse.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.parse.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import List, Union
+
+_tagger = None
+_tagger_name = ""
+
+
+
+[docs] +def dependency_parsing( + text: str, + model: Union[str, None] = None, + tag: str = "str", + engine: str = "esupar", +) -> Union[List[List[str]], str]: + """ + Dependency Parsing + + :param str text: text to apply dependency parsing to + :param str model: model for using with engine \ + (for esupar and transformers_ud) + :param str tag: output type (str or list) + :param str engine: the name of dependency parser + :return: str (conllu) or List + :rtype: Union[List[List[str]], str] + + **Options for engine** + * *esupar* (default) - Tokenizer, POS tagger and Dependency parser \ + using BERT/RoBERTa/DeBERTa models. `GitHub \ + <https://github.com/KoichiYasuoka/esupar>`_ + * *spacy_thai* - Tokenizer, POS tagger, and dependency parser \ + for the Thai language, using Universal Dependencies. \ + `GitHub <https://github.com/KoichiYasuoka/spacy-thai>`_ + * *transformers_ud* - TransformersUD \ + `GitHub <https://github.com/KoichiYasuoka/>`_ + * *ud_goeswith* - POS tagging and dependency parsing \ + using `goeswith` for subwords + + **Options for model (esupar engine)** + * *th* (default) - KoichiYasuoka/roberta-base-thai-spm-upos model \ + `Huggingface \ + <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-upos>`_ + * *KoichiYasuoka/deberta-base-thai-upos* - DeBERTa(V2) model \ + pre-trained on Thai Wikipedia texts for POS tagging and \ + dependency parsing `Huggingface \ + <https://huggingface.co/KoichiYasuoka/deberta-base-thai-upos>`_ + * *KoichiYasuoka/roberta-base-thai-syllable-upos* - RoBERTa model \ + pre-trained on Thai Wikipedia texts for POS tagging and \ + dependency parsing. (syllable level) `Huggingface \ + <https://huggingface.co/KoichiYasuoka/roberta-base-thai-syllable-upos>`_ + * *KoichiYasuoka/roberta-base-thai-char-upos* - RoBERTa model \ + pre-trained on Thai Wikipedia texts for POS tagging \ + and dependency parsing. (char level) `Huggingface \ + <https://huggingface.co/KoichiYasuoka/roberta-base-thai-char-upos>`_ + + If you want to train models for esupar, you can read \ + `Huggingface <https://github.com/KoichiYasuoka/esupar>`_ + + **Options for model (transformers_ud engine)** + * *KoichiYasuoka/deberta-base-thai-ud-head* (default) - \ + DeBERTa(V2) model pretrained on Thai Wikipedia texts \ + for dependency parsing (head-detection using Universal \ + Dependencies) and question-answering, derived from \ + deberta-base-thai. \ + trained by th_blackboard.conll. `Huggingface \ + <https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head>`_ + * *KoichiYasuoka/roberta-base-thai-spm-ud-head* - \ + roberta model pretrained on Thai Wikipedia texts \ + for dependency parsing. 
`Huggingface \ + <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-ud-head>`_ + + **Options for model (ud_goeswith engine)** + * *KoichiYasuoka/deberta-base-thai-ud-goeswith* (default) - \ + This is a DeBERTa(V2) model pre-trained on Thai Wikipedia \ + texts for POS tagging and dependency parsing (using goeswith for subwords) \ + `Huggingface <https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-goeswith>`_ + + :Example: + :: + + from pythainlp.parse import dependency_parsing + + print(dependency_parsing("ผมเป็นคนดี", engine="esupar")) + # output: + # 1 ผม _ PRON _ _ 3 nsubj _ SpaceAfter=No + # 2 เป็น _ VERB _ _ 3 cop _ SpaceAfter=No + # 3 คน _ NOUN _ _ 0 root _ SpaceAfter=No + # 4 ดี _ VERB _ _ 3 acl _ SpaceAfter=No + + print(dependency_parsing("ผมเป็นคนดี", engine="spacy_thai")) + # output: + # 1 ผม PRON PPRS _ 2 nsubj _ SpaceAfter=No + # 2 เป็น VERB VSTA _ 0 ROOT _ SpaceAfter=No + # 3 คนดี NOUN NCMN _ 2 obj _ SpaceAfter=No + """ + global _tagger, _tagger_name + + if _tagger_name != engine: + if engine == "esupar": + from pythainlp.parse.esupar_engine import Parse + + _tagger = Parse(model=model) + elif engine == "transformers_ud": + from pythainlp.parse.transformers_ud import Parse + + _tagger = Parse(model=model) + elif engine == "spacy_thai": + from pythainlp.parse.spacy_thai_engine import Parse + + _tagger = Parse() + elif engine == "ud_goeswith": + from pythainlp.parse.ud_goeswith import Parse + + _tagger = Parse(model=model) + else: + raise NotImplementedError("The engine doesn't support.") + + _tagger_name = engine + + return _tagger(text, tag=tag)
+ +
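The ud_goeswith and transformers_ud engines are listed among the options but not demonstrated in the example. The calls below sketch how they would be invoked; output is omitted since it depends on the downloaded models.

::

    from pythainlp.parse import dependency_parsing

    # uses the default model KoichiYasuoka/deberta-base-thai-ud-goeswith
    print(dependency_parsing("ผมเป็นคนดี", engine="ud_goeswith"))

    # return a list of token rows instead of a CoNLL-U string
    print(dependency_parsing("ผมเป็นคนดี", engine="ud_goeswith", tag="list"))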
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/phayathaibert/core.html b/5.1/_modules/pythainlp/phayathaibert/core.html new file mode 100644 index 0000000..fbb5c97 --- /dev/null +++ b/5.1/_modules/pythainlp/phayathaibert/core.html @@ -0,0 +1,654 @@ + + + + + + + + pythainlp.phayathaibert.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.phayathaibert.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+import random
+import re
+import warnings
+from typing import Callable, List, Tuple, Union
+
+from transformers import (
+    CamembertTokenizer,
+)
+
+from pythainlp.tokenize import word_tokenize
+
+_PAT_URL = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
+
+_model_name = "clicknext/phayathaibert"
+_tokenizer = CamembertTokenizer.from_pretrained(_model_name)
+
+
+
+[docs] +class ThaiTextProcessor: +
+[docs] + def __init__(self): + ( + self._TK_UNK, + self._TK_REP, + self._TK_WREP, + self._TK_URL, + self._TK_END, + ) = "<unk> <rep> <wrep> <url> </s>".split() + self.SPACE_SPECIAL_TOKEN = "<_>"
+ + +
+[docs] + def replace_url(self, text: str) -> str: + """ + Replace url in `text` with TK_URL (https://stackoverflow.com/a/6041965) + :param str text: text to replace url + :return: text where urls are replaced + :rtype: str + :Example: + >>> replace_url("go to https://github.com") + go to <url> + """ + return re.sub(_PAT_URL, self._TK_URL, text)
+ + +
+[docs] + def rm_brackets(self, text: str) -> str: + """ + Remove all empty brackets and artifacts within brackets from `text`. + :param str text: text to remove useless brackets + :return: text where all useless brackets are removed + :rtype: str + :Example: + >>> rm_brackets("hey() whats[;] up{*&} man(hey)") + hey whats up man(hey) + """ + # remove empty brackets + new_line = re.sub(r"\(\)", "", text) + new_line = re.sub(r"\{\}", "", new_line) + new_line = re.sub(r"\[\]", "", new_line) + # brackets with only punctuations + new_line = re.sub(r"\([^a-zA-Z0-9ก-๙]+\)", "", new_line) + new_line = re.sub(r"\{[^a-zA-Z0-9ก-๙]+\}", "", new_line) + new_line = re.sub(r"\[[^a-zA-Z0-9ก-๙]+\]", "", new_line) + # artifiacts after ( + new_line = re.sub( + r"(?<=\()[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line + ) + new_line = re.sub( + r"(?<=\{)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line + ) + new_line = re.sub( + r"(?<=\[)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line + ) + # artifacts before ) + new_line = re.sub( + r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\))", "", new_line + ) + new_line = re.sub( + r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\})", "", new_line + ) + new_line = re.sub( + r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\])", "", new_line + ) + return new_line
+ + +
+[docs]
+    def replace_newlines(self, text: str) -> str:
+        """
+        Replace newlines in `text` with spaces.
+        :param str text: text to replace all newlines with spaces
+        :return: text where all newlines are replaced with spaces
+        :rtype: str
+        :Example:
+            >>> replace_newlines("hey whats\n\nup")
+            hey whats up
+        """
+
+        return re.sub(r"[\n]", " ", text.strip())
+ + +
+[docs] + def rm_useless_spaces(self, text: str) -> str: + """ + Remove multiple spaces in `text`. (code from `fastai`) + :param str text: text to replace useless spaces + :return: text where all spaces are reduced to one + :rtype: str + :Example: + >>> rm_useless_spaces("oh no") + oh no + """ + return re.sub(" {2,}", " ", text)
+ + +
+[docs] + def replace_spaces(self, text: str, space_token: str = "<_>") -> str: + """ + Replace spaces with _ + :param str text: text to replace spaces + :return: text where all spaces replaced with _ + :rtype: str + :Example: + >>> replace_spaces("oh no") + oh_no + """ + return re.sub(" ", space_token, text)
+ + +
+[docs] + def replace_rep_after(self, text: str) -> str: + """ + Replace repetitions at the character level in `text` + :param str text: input text to replace character repetition + :return: text with repetitive tokens removed. + :rtype: str + :Example: + >>> text = "กาาาาาาา" + >>> replace_rep_after(text) + 'กา' + """ + + def _replace_rep(m): + c, cc = m.groups() + return f"{c}" + + re_rep = re.compile(r"(\S)(\1{3,})") + return re_rep.sub(_replace_rep, text)
+ + +
+[docs] + def replace_wrep_post(self, toks: List[str]) -> List[str]: + """ + Replace repetitive words post tokenization; + fastai `replace_wrep` does not work well with Thai. + :param List[str] toks: list of tokens + :return: list of tokens where repetitive words are removed. + :rtype: List[str] + :Example: + >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"] + >>> replace_wrep_post(toks) + ['กา', 'น้ำ'] + """ + previous_word = "" + rep_count = 0 + res = [] + for current_word in toks + [self._TK_END]: + if current_word == previous_word: + rep_count += 1 + elif (current_word != previous_word) & (rep_count > 0): + res += [previous_word] + rep_count = 0 + else: + res.append(previous_word) + previous_word = current_word + + return res[1:]
+ + +
+[docs] + def remove_space(self, toks: List[str]) -> List[str]: + """ + Do not include space for bag-of-word models. + :param List[str] toks: list of tokens + :return: List of tokens where space tokens (" ") are filtered out + :rtype: List[str] + :Example: + >>> toks = ["ฉัน", "เดิน", " ", "กลับ", "บ้าน"] + >>> remove_space(toks) + ['ฉัน', 'เดิน', 'กลับ', 'บ้าน'] + """ + res = [] + for t in toks: + t = t.strip() + if t: + res.append(t) + + return res
+ + + # combine them together +
+[docs] + def preprocess( + self, + text: str, + pre_rules: List[Callable] = [ + rm_brackets, + replace_newlines, + rm_useless_spaces, + replace_spaces, + replace_rep_after, + ], + tok_func: Callable = word_tokenize, + ) -> str: + text = text.lower() + for rule in pre_rules: + text = rule(text) + toks = tok_func(text) + + return "".join(toks)
+
+ + + +
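The individual ThaiTextProcessor steps can also be chained by hand. The sketch below combines the methods documented above; the input string is made up for illustration, and note that importing this module loads the PhayaThaiBERT tokenizer on first use.

::

    from pythainlp.phayathaibert.core import ThaiTextProcessor

    proc = ThaiTextProcessor()

    text = "ไปอ่านต่อได้ที่ https://github.com   นะ\n\nขอบคุณครับ"
    text = proc.replace_url(text)        # URLs -> <url>
    text = proc.replace_newlines(text)   # newlines -> spaces
    text = proc.rm_useless_spaces(text)  # collapse repeated spaces
    text = proc.replace_spaces(text)     # spaces -> <_>
    print(text)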
+[docs] +class ThaiTextAugmenter: +
+[docs] + def __init__(self) -> None: + from transformers import ( + AutoModelForMaskedLM, + AutoTokenizer, + pipeline, + ) + + self.tokenizer = AutoTokenizer.from_pretrained(_model_name) + self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained( + _model_name + ) + self.model = pipeline( + "fill-mask", + tokenizer=self.tokenizer, + model=self.model_for_masked_lm, + ) + self.processor = ThaiTextProcessor()
+ + +
+[docs] + def generate( + self, + sample_text: str, + word_rank: int, + max_length: int = 3, + sample: bool = False, + ) -> str: + sample_txt = sample_text + final_text = "" + for j in range(max_length): + input = self.processor.preprocess(sample_txt) + if sample: + random_word_idx = random.randint(0, 4) + output = self.model(input)[random_word_idx]["sequence"] + else: + output = self.model(input)[word_rank]["sequence"] + sample_txt = output + "<mask>" + final_text = sample_txt + + gen_txt = re.sub("<mask>", "", final_text) + + return gen_txt
+ + +
+[docs]
+    def augment(
+        self,
+        text: str,
+        num_augs: int = 3,
+        sample: bool = False,
+    ) -> List[str]:
+        """
+        Text augmentation from PhayaThaiBERT
+
+        :param str text: Thai text
+        :param int num_augs: number of augmented texts to generate (at most 5)
+        :param bool sample: whether to randomly sample the generated text;\
+            set to True if more word diversity is needed
+
+        :return: list of augmented texts
+        :rtype: List[str]
+
+        :Example:
+        ::
+
+            from pythainlp.augment.lm import ThaiTextAugmenter
+
+            aug = ThaiTextAugmenter()
+            aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_augs=5)
+
+            # output = ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้ครับ.',
+                'ช้างมีทั้งหมด 50 ตัว บนพื้นดินครับ...',
+                'ช้างมีทั้งหมด 50 ตัว บนท้องฟ้าครับ...',
+                'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼',
+                'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
+        """
+        MAX_NUM_AUGS = 5
+        augment_list = []
+
+        if num_augs <= MAX_NUM_AUGS:
+            for rank in range(num_augs):
+                gen_text = self.generate(
+                    text,
+                    rank,
+                    sample=sample,
+                )
+                processed_text = re.sub(
+                    "<_>", " ", self.processor.preprocess(gen_text)
+                )
+                augment_list.append(processed_text)
+        else:
+            raise ValueError(
+                f"Requested number of augmentations ({num_augs}) exceeds "
+                f"the limit of {MAX_NUM_AUGS}."
+            )
+
+        return augment_list
+
+ + + +
+[docs] +class PartOfSpeechTagger: +
+[docs] + def __init__(self, model: str = "lunarlist/pos_thai_phayathai") -> None: + # Load model directly + from transformers import ( + AutoModelForTokenClassification, + AutoTokenizer, + ) + + self.tokenizer = AutoTokenizer.from_pretrained(model) + self.model = AutoModelForTokenClassification.from_pretrained(model)
+ + +
+[docs]
+    def get_tag(
+        self, sentence: str, strategy: str = "simple"
+    ) -> List[List[Tuple[str, str]]]:
+        """
+        Marks a sentence with part-of-speech (POS) tags.
+
+        :param str sentence: a Thai sentence to be tagged
+        :param str strategy: token aggregation strategy passed to the
+            underlying transformers pipeline (default: "simple")
+        :return: a list of lists of tuples (word, POS tag)
+        :rtype: list[list[tuple[str, str]]]
+
+        :Example:
+
+        Labels POS for given sentence::
+
+            from pythainlp.phayathaibert.core import PartOfSpeechTagger
+
+            tagger = PartOfSpeechTagger()
+            tagger.get_tag("แมวทำอะไรตอนห้าโมงเช้า")
+            # output:
+            # [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]]
+        """
+        from transformers import TokenClassificationPipeline
+
+        pipeline = TokenClassificationPipeline(
+            model=self.model,
+            tokenizer=self.tokenizer,
+            aggregation_strategy=strategy,
+        )
+        outputs = pipeline(sentence)
+        word_tags = [[(tag["word"], tag["entity_group"]) for tag in outputs]]
+
+        return word_tags
+
+ + + +
+[docs] +class NamedEntityTagger: +
+[docs] + def __init__(self, model: str = "Pavarissy/phayathaibert-thainer") -> None: + from transformers import ( + AutoModelForTokenClassification, + AutoTokenizer, + ) + + self.tokenizer = AutoTokenizer.from_pretrained(model) + self.model = AutoModelForTokenClassification.from_pretrained(model)
+ + +
+[docs]
+    def get_ner(
+        self,
+        text: str,
+        tag: bool = False,
+        pos: bool = False,
+        strategy: str = "simple",
+    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
+        """
+        This function tags named entities in Thai text.
+
+        :param str text: text in Thai to be tagged
+        :param bool tag: if True, return an HTML-like string with entity tags
+        :param bool pos: include part-of-speech tags in the output
+            (not supported by this model; use PartOfSpeechTagger instead)
+        :param str strategy: token aggregation strategy passed to the
+            underlying transformers pipeline (default: "simple")
+        :return: a list of tuples of tokenized words and NER tags,
+                 plus POS tags if the parameter `pos` is specified as `True`,
+                 or an HTML-like tagged string if the parameter `tag` is
+                 specified as `True`.
+        :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]
+        :Example:
+
+            >>> from pythainlp.phayathaibert.core import NamedEntityTagger
+            >>>
+            >>> tagger = NamedEntityTagger()
+            >>> tagger.get_ner("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย")
+            [('นายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย', 'PERSON'),
+            ('จาก', 'LOCATION'),
+            ('ประเทศไทย', 'LOCATION')]
+            >>> tagger.get_ner("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย", tag=True)
+            'ทดสอบ<PERSON>นายปวริศ เรืองจุติโพธิ์พาน</PERSON>\
+                <LOCATION>จาก</LOCATION><LOCATION>ประเทศไทย</LOCATION>'
+        """
+        from transformers import TokenClassificationPipeline
+
+        if pos:
+            warnings.warn(
+                "This model does not support part-of-speech output; "
+                "the `pos` argument is ignored."
+            )
+
+        sample_output = []
+        tag_text_list = []
+        current_pos = 0
+        pipeline = TokenClassificationPipeline(
+            model=self.model,
+            tokenizer=self.tokenizer,
+            aggregation_strategy=strategy,
+        )
+        outputs = pipeline(text)
+
+        for token in outputs:
+            ner_tag = token["entity_group"]
+            begin_pos, end_pos = token["start"], token["end"]
+            if current_pos == 0:
+                text_tag = (
+                    text[:begin_pos]
+                    + f"<{ner_tag}>"
+                    + text[begin_pos:end_pos]
+                    + f"</{ner_tag}>"
+                )
+            else:
+                text_tag = (
+                    text[current_pos:begin_pos]
+                    + f"<{ner_tag}>"
+                    + text[begin_pos:end_pos]
+                    + f"</{ner_tag}>"
+                )
+            tag_text_list.append(text_tag)
+            sample_output.append((token["word"], token["entity_group"]))
+            current_pos = end_pos
+
+        if tag:
+            return str("".join(tag_text_list))
+
+        return sample_output
+
+ + + +
+[docs] +def segment(sentence: str) -> List[str]: + """ + Subword tokenize of PhayaThaiBERT, \ + sentencepiece from WangchanBERTa model with vocabulary expansion. + + :param str sentence: text to be tokenized + :return: list of subwords + :rtype: list[str] + """ + if not sentence or not isinstance(sentence, str): + return [] + + return _tokenizer.tokenize(sentence)
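A short usage sketch for `segment` (hedged: the exact subword pieces depend on the PhayaThaiBERT SentencePiece vocabulary, so no specific output is shown):

    from pythainlp.phayathaibert.core import segment

    pieces = segment("ผมชอบกินข้าว")
    print(pieces)       # a list of subword strings from the PhayaThaiBERT tokenizer
    print(segment(""))  # [] -- empty or non-string input returns an empty list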
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/soundex/core.html b/5.1/_modules/pythainlp/soundex/core.html new file mode 100644 index 0000000..2d81125 --- /dev/null +++ b/5.1/_modules/pythainlp/soundex/core.html @@ -0,0 +1,222 @@ + + + + + + + + pythainlp.soundex.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.soundex.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai soundex
+
+Has four systems to choose from: Udom83 (default), LK82, MetaSound, and Prayut & Somchaip (Thai-English cross-language)
+"""
+from pythainlp.soundex import DEFAULT_SOUNDEX_ENGINE
+from pythainlp.soundex.lk82 import lk82
+from pythainlp.soundex.metasound import metasound
+from pythainlp.soundex.prayut_and_somchaip import prayut_and_somchaip
+from pythainlp.soundex.udom83 import udom83
+
+# Other Thai soundex systems (not implemented yet): Arun91, KSS97
+# [KSS97] https://linux.thai.net/~thep/soundex/soundex.html
+
+
+
+[docs]
+def soundex(
+    text: str, engine: str = DEFAULT_SOUNDEX_ENGINE, length: int = 4
+) -> str:
+    """
+    This function converts Thai text into phonetic code.
+
+    :param str text: word
+    :param str engine: soundex engine
+    :param int length: preferred length of the Soundex code (default is 4)\
+        for metasound and prayut_and_somchaip only
+    :return: Soundex code
+    :rtype: str
+
+    :Options for engine:
+        * *udom83* (default) - Thai soundex algorithm proposed
+          by Wannee Udompanich [#udom83]_
+        * *lk82* - Thai soundex algorithm proposed by
+          Vichit Lorchirachoonkul [#lk82]_
+        * *metasound* - Thai soundex algorithm based on a combination
+          of Metaphone and Soundex proposed by Snae & Brückner [#metasound]_
+        * *prayut_and_somchaip* - Thai-English Cross-Language Transliterated
+          Word Retrieval using Soundex Technique [#prayut_and_somchaip]_
+
+    :Example:
+    ::
+
+        from pythainlp.soundex import soundex
+
+        soundex("ลัก"), soundex("ลัก", engine='lk82'), \\
+            soundex("ลัก", engine='metasound')
+        # output: ('ร100000', 'ร1000', 'ล100')
+
+        soundex("รัก"), soundex("รัก", engine='lk82'), \\
+            soundex("รัก", engine='metasound')
+        # output: ('ร100000', 'ร1000', 'ร100')
+
+        soundex("รักษ์"), soundex("รักษ์", engine='lk82'), \\
+            soundex("รักษ์", engine='metasound')
+        # output: ('ร100000', 'ร1000', 'ร100')
+
+        soundex("บูรณการ"), soundex("บูรณการ", engine='lk82'), \\
+            soundex("บูรณการ", engine='metasound')
+        # output: ('บ931900', 'บE419', 'บ551')
+
+        soundex("ปัจจุบัน"), soundex("ปัจจุบัน", engine='lk82'), \\
+            soundex("ปัจจุบัน", engine='metasound')
+        # output: ('ป775300', 'ป3E54', 'ป223')
+
+        soundex("vp", engine="prayut_and_somchaip")
+        # output: '11'
+        soundex("วีพี", engine="prayut_and_somchaip")
+        # output: '11'
+    """
+    if engine == "lk82":
+        _soundex = lk82(text)
+    elif engine == "prayut_and_somchaip":
+        _soundex = prayut_and_somchaip(text, length=length)
+    elif engine == "metasound":
+        _soundex = metasound(text, length=length)
+    else:  # default, use "udom83"
+        _soundex = udom83(text)
+    return _soundex
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/soundex/lk82.html b/5.1/_modules/pythainlp/soundex/lk82.html new file mode 100644 index 0000000..cff9cc5 --- /dev/null +++ b/5.1/_modules/pythainlp/soundex/lk82.html @@ -0,0 +1,272 @@ + + + + + + + + pythainlp.soundex.lk82 — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.soundex.lk82

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai soundex - LK82 system
+
+Original paper:
+Vichit Lorchirachoonkul. 1982. A Thai soundex
+system. Information Processing & Management,
+18(5):243–255.
+https://doi.org/10.1016/0306-4573(82)90003-6
+
+Python implementation:
+by Korakot Chaovavanich
+https://gist.github.com/korakot/0b772e09340cac2f493868da035597e8
+"""
+import re
+
+from pythainlp.util import remove_tonemark
+
+_TRANS1 = str.maketrans(
+    "กขฃคฅฆงจฉชฌซศษสญยฎดฏตณนฐฑฒถทธบปผพภฝฟมรลฬฤฦวหฮอ",
+    "กกกกกกงจชชชซซซซยยดดตตนนททททททบปพพพฟฟมรรรรรวหหอ",
+)
+_TRANS2 = str.maketrans(
+    "กขฃคฅฆงจฉชซฌฎฏฐฑฒดตถทธศษสญณนรลฬฤฦบปพฟภผฝมำยวไใหฮาๅึืเแโุูอ",
+    "1111112333333333333333333444444445555555667777889AAABCDEEF",
+)
+
+# silenced
+_RE_KARANT = re.compile(r"จน์|มณ์|ณฑ์|ทร์|ตร์|[ก-ฮ]์|[ก-ฮ][ะ-ู]์")
+
+# signs, symbols, vowel that has no explicit sounds
+# Paiyannoi, Phinthu, Maiyamok, Maitaikhu, Nikhahit
+_RE_SIGN = re.compile(r"[\u0e2f\u0e3a\u0e46\u0e47\u0e4d]")
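As a small illustration of the karan pattern above (a hypothetical snippet, not part of the library source), a silenced consonant together with its thanthakhat is stripped before encoding:

    import re

    _RE_KARANT = re.compile(r"จน์|มณ์|ณฑ์|ทร์|ตร์|[ก-ฮ]์|[ก-ฮ][ะ-ู]์")
    print(_RE_KARANT.sub("", "รักษ์"))  # "รัก" -- the silenced "ษ์" is removed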
+
+
+
+[docs] +def lk82(text: str) -> str: + """ + This function converts Thai text into phonetic code with the + Thai soundex algorithm named **LK82** [#lk82]_. + + :param str text: Thai word + + :return: LK82 soundex of the given Thai word + :rtype: str + + :Example: + :: + + from pythainlp.soundex import lk82 + + lk82("ลัก") + # output: 'ร1000' + + lk82("รัก") + # output: 'ร1000' + + lk82("รักษ์") + # output: 'ร1000' + + lk82("บูรณการ") + # output: 'บE419' + + lk82("ปัจจุบัน") + # output: 'ป3E54' + """ + if not text or not isinstance(text, str): + return "" + + text = remove_tonemark(text) # 4. remove tone marks + text = _RE_KARANT.sub("", text) # 4. remove "karat" characters + text = _RE_SIGN.sub("", text) # 5. remove Mai tai khu, + + if not text: + return "" + + # 6. encode the first character + res = [] + if "ก" <= text[0] <= "ฮ": + res.append(text[0].translate(_TRANS1)) + text = text[1:] + else: + if len(text) > 1: + res.append(text[1].translate(_TRANS1)) + res.append(text[0].translate(_TRANS2)) + text = text[2:] + + # encode the rest + i_v = None # ตำแหน่งตัวคั่นล่าสุด (สระ) + len_text = len(text) + for i, c in enumerate(text): + if ( + c in "\u0e30\u0e31\u0e34\u0e35" + ): # 7. ตัวคั่นเฉยๆ/ Sara A, Mai Han-Akat, Sara I, Sara II + i_v = i + res.append("") + elif ( + c in "\u0e32\u0e36\u0e37\u0e39\u0e45" + ): # 8. คั่นและใส่/ Sara Aa, Sara Ue, Sara Uee, Sara Uu, Lankkhangyao + i_v = i + res.append(c.translate(_TRANS2)) + elif c == "\u0e38": # 9. สระอุ / Sara U + i_v = i + if i == 0 or (text[i - 1] not in "ตธ"): + res.append(c.translate(_TRANS2)) + else: + res.append("") + elif c in "\u0e2b\u0e2d": # หอ + if i + 1 < len_text and ( + text[i + 1] in "\u0e36\u0e37\u0e38\u0e39" + ): # Sara Ue, Sara Uee, Sara U, Sara Uu + res.append(c.translate(_TRANS2)) + elif c in "\u0e22\u0e23\u0e24\u0e26\u0e27": + if i_v == i - 1 or ( + i + 1 < len_text + and (text[i + 1] in "\u0e36\u0e37\u0e38\u0e39") + ): # Sara Ue, Sara Uee, Sara U, Sara Uu + res.append(c.translate(_TRANS2)) + else: + res.append(c.translate(_TRANS2)) # 12. + + # 13. remove repetitions + res2 = [res[0]] + for i in range(1, len(res)): + if res[i] != res[i - 1]: + res2.append(res[i]) + + # 14. fill with zeros + return ("".join(res2) + "0000")[:5]
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/soundex/metasound.html b/5.1/_modules/pythainlp/soundex/metasound.html new file mode 100644 index 0000000..76e18a7 --- /dev/null +++ b/5.1/_modules/pythainlp/soundex/metasound.html @@ -0,0 +1,249 @@ + + + + + + + + pythainlp.soundex.metasound — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.soundex.metasound

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai soundex - MetaSound system
+
+References:
+Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with
+a Statistical Ontology for Analysing Names Given in Accordance
+with Thai Astrology.
+https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
+"""
+
+_CONS_THANTHAKHAT = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ์"
+_THANTHAKHAT = "์"  # \u0e4c
+_C1 = "กขฃคฆฅ"  # sound K -> coded letter 1
+_C2 = "จฉชฌซฐทฒดฎตสศษ"  # D -> 2
+_C3 = "ฟฝพผภบป"  # B -> 3
+_C4 = "ง"  # NG -> 4
+_C5 = "ลฬรนณฦญ"  # N -> 5
+_C6 = "ม"  # M -> 6
+_C7 = "ย"  # Y -> 7
+_C8 = "ว"  # W -> 8
+
+
+
+[docs]
+def metasound(text: str, length: int = 4) -> str:
+    """
+    This function converts Thai text into phonetic code with the
+    matching technique called **MetaSound**
+    [#metasound]_ (a combination of the Soundex and Metaphone algorithms).
+    The MetaSound algorithm was developed specifically for the Thai language.
+
+    :param str text: Thai text
+    :param int length: preferred length of the MetaSound code (default is 4)
+
+    :return: MetaSound for the given text
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.soundex.metasound import metasound
+
+        metasound("ลัก")
+        # output: 'ล100'
+
+        metasound("รัก")
+        # output: 'ร100'
+
+        metasound("รักษ์")
+        # output: 'ร100'
+
+        metasound("บูรณการ", 5)
+        # output: 'บ5515'
+
+        metasound("บูรณการ", 6)
+        # output: 'บ55150'
+
+        metasound("บูรณการ", 4)
+        # output: 'บ551'
+    """
+    if not text or not isinstance(text, str):
+        return ""
+
+    # keep only consonants and thanthakhat
+    chars = []
+    for ch in text:
+        if ch in _CONS_THANTHAKHAT:
+            chars.append(ch)
+
+    # remove karan (thanthakhat and a consonant before it)
+    i = 0
+    while i < len(chars):
+        if chars[i] == _THANTHAKHAT:
+            if i > 0:
+                chars[i - 1] = " "
+            chars[i] = " "
+        i += 1
+
+    # retain first consonant, encode the rest
+    chars = chars[:length]
+    i = 1
+    while i < len(chars):
+        if chars[i] in _C1:
+            chars[i] = "1"
+        elif chars[i] in _C2:
+            chars[i] = "2"
+        elif chars[i] in _C3:
+            chars[i] = "3"
+        elif chars[i] in _C4:
+            chars[i] = "4"
+        elif chars[i] in _C5:
+            chars[i] = "5"
+        elif chars[i] in _C6:
+            chars[i] = "6"
+        elif chars[i] in _C7:
+            chars[i] = "7"
+        elif chars[i] in _C8:
+            chars[i] = "8"
+        else:
+            chars[i] = "0"
+        i += 1
+
+    while len(chars) < length:
+        chars.append("0")
+
+    return "".join(chars)
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/soundex/prayut_and_somchaip.html b/5.1/_modules/pythainlp/soundex/prayut_and_somchaip.html new file mode 100644 index 0000000..f232659 --- /dev/null +++ b/5.1/_modules/pythainlp/soundex/prayut_and_somchaip.html @@ -0,0 +1,230 @@ + + + + + + + + pythainlp.soundex.prayut_and_somchaip — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for pythainlp.soundex.prayut_and_somchaip

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai-English Cross-Language Transliterated Word Retrieval
+using Soundex Technique
+
+References:
+Prayut Suwanvisat, Somchai Prasitjutrakul. Thai-English Cross-Language Transliterated Word Retrieval using Soundex Technique. In 1998 [cited 2022 Sep 8]. Available from: https://www.cp.eng.chula.ac.th/~somchai/spj/papers/ThaiText/ncsec98-clir.pdf
+"""
+from pythainlp import thai_characters
+
+_C0 = "AEIOUHWYอ"
+_C1 = "BFPVบฝฟปผพภว"
+_C2 = "CGJKQSXZขฃคฅฆฉขฌกจซศษส"
+_C3 = "DTฎดฏตฐฑฒถทธ"
+_C4 = "Lลฬ"
+_C5 = "MNมณน"
+_C6 = "Rร"
+_C7 = "AEIOUอ"
+_C8 = "Hหฮ"
+_C1_1 = "Wว"
+_C9 = "Yยญ"
+_C52 = "ง"
+
+
+
+[docs] +def prayut_and_somchaip(text: str, length: int = 4) -> str: + """ + This function converts English-Thai Cross-Language Transliterated Word into + phonetic code with the matching technique called **Soundex** [#prayut_and_somchaip]_. + + :param str text: English-Thai Cross-Language Transliterated Word + :param int length: preferred length of the Soundex code (default is 4) + + :return: Soundex for the given text + :rtype: str + + :Example: + :: + + from pythainlp.soundex.prayut_and_somchaip import prayut_and_somchaip + + prayut_and_somchaip("king", 2) + # output: '52' + + prayut_and_somchaip("คิง", 2) + # output: '52' + """ + if not text or not isinstance(text, str): + return "" + text = text.upper() + # keep only consonants (English-Thai) + chars = [] + for ch in text: + if ch in thai_characters + "ABCDEFGHIJKLMNOPQRSTUVWXYZ": + chars.append(ch) + + i = 0 + while i < len(chars): + if i == 0 and chars[i] in _C0: + chars[i] = "0" + elif chars[i] in _C1: + chars[i] = "1" + elif chars[i] in _C2: + chars[i] = "2" + elif chars[i] in _C3: + chars[i] = "3" + elif chars[i] in _C4: + chars[i] = "4" + elif chars[i] in _C5: + chars[i] = "5" + elif chars[i] in _C6: + chars[i] = "6" + elif chars[i] in _C52: + chars[i] = "52" + elif chars[i] in _C7 and i != 0: + chars[i] = "7" + elif chars[i] in _C8 and i != 0: + chars[i] = "8" + elif chars[i] in _C1_1 and i != 0: + chars[i] = "1" + elif chars[i] in _C9 and i != 0: + chars[i] = "9" + else: + chars[i] = None + i += 1 + chars = list("".join([i for i in chars if i is not None])) + return "".join(chars[-length:])
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/soundex/sound.html b/5.1/_modules/pythainlp/soundex/sound.html new file mode 100644 index 0000000..a131374 --- /dev/null +++ b/5.1/_modules/pythainlp/soundex/sound.html @@ -0,0 +1,232 @@ + + + + + + + + pythainlp.soundex.sound — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.soundex.sound

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from typing import List
+
+import panphon
+import panphon.distance
+
+from pythainlp.tokenize import word_tokenize
+from pythainlp.transliterate import pronunciate, transliterate
+
+_ft = panphon.FeatureTable()
+_dst = panphon.distance.Distance()
+
+def _clean_ipa(ipa: str) -> str:
+    """
+    Clean IPA by removing tones and space between phonetic codes
+
+    :param str ipa: IPA text
+    :return: IPA with tones removed from the text
+    :rtype: str
+    """
+    return ipa.replace("˩˩˦", "").replace("˥˩", "").replace("˨˩", "").replace("˦˥", "").replace("˧", "").replace(" .", ".").replace(". ", ".").strip()
+
+
+[docs] +def word2audio(word: str) -> str: + """ + Convert word to IPA + + :param str word: Thai word + :return: IPA with tones removed from the text + :rtype: str + + :Example: + :: + + from pythainlp.soundex.sound import word2audio + + word2audio("น้ำ") + # output : 'n aː m .' + """ + _word = word_tokenize(word) + _phone = [pronunciate(w, engine="w2p") for w in _word] + _ipa = [_clean_ipa(transliterate(phone, engine="thaig2p")) for phone in _phone] + return '.'.join(_ipa)
+ + +
+[docs] +def audio_vector(word: str) -> List[List[int]]: + """ + Convert audio to vector list + + :param str word: Thai word + :return: List of features from panphon + :rtype: List[List[int]] + + :Example: + :: + + from pythainlp.soundex.sound import audio_vector + + audio_vector("น้ำ") + # output : [[-1, 1, 1, -1, -1, -1, ...]] + """ + return _ft.word_to_vector_list(word2audio(word), numeric=True)
+ + +
+[docs] +def word_approximation(word: str, list_word: List[str]) -> List[float]: + """ + Thai Word Approximation + + :param str word: Thai word + :param str list_word: Thai word + :return: List of approximation of words (The smaller the value, the closer) + :rtype: List[float] + + :Example: + :: + + from pythainlp.soundex.sound import word_approximation + + word_approximation("รถ", ["รด", "รส", "รม", "น้ำ"]) + # output : [0.0, 0.0, 3.875, 8.375] + """ + _word = word2audio(word) + _list_word = [word2audio(w) for w in list_word] + _distance = [_dst.weighted_feature_edit_distance(_word, w) for w in _list_word] + return _distance
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/soundex/udom83.html b/5.1/_modules/pythainlp/soundex/udom83.html new file mode 100644 index 0000000..69b81a6 --- /dev/null +++ b/5.1/_modules/pythainlp/soundex/udom83.html @@ -0,0 +1,244 @@ + + + + + + + + pythainlp.soundex.udom83 — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.soundex.udom83

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai soundex - Udom83 system
+
+Original paper:
+Wannee Udompanich. String searching for Thai alphabet
+using Soundex compression technique. Master Thesis
+of Department of Computer Engineering Graduate
+School, Chulalongkorn University, 1983.
+http://cuir.car.chula.ac.th/handle/123456789/48471
+
+Python implementation:
+by Korakot Chaovavanich
+https://gist.github.com/korakot/0b772e09340cac2f493868da035597e8
+"""
+import re
+
+from pythainlp import thai_consonants
+
+_THANTHAKHAT = "\u0e4c"
+_RE_1 = re.compile(r"รร([\u0e40-\u0e44])")  # เ-ไ
+_RE_2 = re.compile(f"รร([{thai_consonants}][{thai_consonants}\u0e40-\u0e44])")
+_RE_3 = re.compile(f"รร([{thai_consonants}][\u0e30-\u0e39\u0e48-\u0e4c])")
+_RE_4 = re.compile(r"รร")
+_RE_5 = re.compile(f"ไ([{thai_consonants}]ย)")
+_RE_6 = re.compile(f"[ไใ]([{thai_consonants}])")
+_RE_7 = re.compile(r"\u0e33(ม[\u0e30-\u0e39])")
+_RE_8 = re.compile(r"\u0e33ม")
+_RE_9 = re.compile(r"\u0e33")  # ำ
+_RE_10 = re.compile(
+    f"จน์|มณ์|ณฑ์|ทร์|ตร์|"
+    f"[{thai_consonants}]{_THANTHAKHAT}|[{thai_consonants}]"
+    f"[\u0e30-\u0e39]{_THANTHAKHAT}"
+)
+_RE_11 = re.compile(r"[\u0e30-\u0e4c]")
+
+_TRANS1 = str.maketrans(
+    "กขฃคฅฆงจฉชฌซศษสฎดฏตฐฑฒถทธณนบปผพภฝฟมญยรลฬฤฦวอหฮ",
+    "กขขขขขงจชชชสสสสดดตตททททททนนบปพพพฟฟมยยรรรรรวอฮฮ",
+)
+_TRANS2 = str.maketrans(
+    "มวำกขฃคฅฆงยญณนฎฏดตศษสบปพภผฝฟหอฮจฉชซฌฐฑฒถทธรฤลฦ",
+    "0001111112233344444445555666666777778888889999",
+)
+
+
+
+[docs]
+def udom83(text: str) -> str:
+    """
+    This function converts Thai text into phonetic code with the
+    Thai soundex algorithm named **Udom83** [#udom83]_.
+
+    :param str text: Thai word
+
+    :return: Udom83 soundex
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.soundex import udom83
+
+        udom83("ลัก")
+        # output: 'ร100000'
+
+        udom83("รัก")
+        # output: 'ร100000'
+
+        udom83("รักษ์")
+        # output: 'ร100000'
+
+        udom83("บูรณการ")
+        # output: 'บ931900'
+
+        udom83("ปัจจุบัน")
+        # output: 'ป775300'
+    """
+
+    if not text or not isinstance(text, str):
+        return ""
+
+    text = _RE_1.sub("ัน\\1", text)
+    text = _RE_2.sub("ั\\1", text)
+    text = _RE_3.sub("ัน\\1", text)
+    text = _RE_4.sub("ัน", text)
+    text = _RE_5.sub("\\1", text)
+    text = _RE_6.sub("\\1ย", text)
+    text = _RE_7.sub("ม\\1", text)
+    text = _RE_8.sub("ม", text)
+    text = _RE_9.sub("ม", text)
+    text = _RE_10.sub("", text)
+    text = _RE_11.sub("", text)
+
+    if not text:
+        return ""
+
+    sd = "".join(
+        [text[0].translate(_TRANS1), text[1:].translate(_TRANS2), "000000"]
+    )
+
+    return sd[:7]
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/spell/core.html b/5.1/_modules/pythainlp/spell/core.html new file mode 100644 index 0000000..52d6867 --- /dev/null +++ b/5.1/_modules/pythainlp/spell/core.html @@ -0,0 +1,342 @@ + + + + + + + + pythainlp.spell.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.spell.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Spell checking functions
+"""
+
+import itertools
+from typing import List
+
+from pythainlp.spell import DEFAULT_SPELL_CHECKER
+
+
+
+[docs] +def spell(word: str, engine: str = "pn") -> List[str]: + """ + Provides a list of possible correct spellings of the given word. + The list of words are from the words in the dictionary + that incurs an edit distance value of 1 or 2. + The result is a list of words sorted by their occurrences + in the spelling dictionary in descending order. + + :param str word: Word to check spell of + :param str engine: + * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default) + * *phunspell* - A spell checker utilizing spylls, a port of Hunspell. + * *symspellpy* - symspellpy is a Python port of SymSpell v6.5. + * *tltk* - wrapper for `TLTK <https://pypi.org/project/tltk/>`_. + + :return: list of possible correct words within 1 or 2 edit distance and + sorted by frequency of word occurrences in the spelling dictionary + in descending order. + :rtype: list[str] + + :Example: + :: + + from pythainlp.spell import spell + + spell("เส้นตรบ", engine="pn") + # output: ['เส้นตรง'] + + spell("เส้นตรบ") + # output: ['เส้นตรง'] + + spell("เส้นตรบ", engine="tltk") + # output: ['เส้นตรง'] + + spell("ครัช") + # output: ['ครับ', 'ครัว', 'รัช', 'ครัม', 'ครัน', 'วรัช', 'ครัส', + # 'ปรัช', 'บรัช', 'ครัง', 'คัช', 'คลัช', 'ครัย', 'ครัด'] + + spell("กระปิ") + # output: ['กะปิ', 'กระบิ'] + + spell("สังเกตุ") + # output: ['สังเกต'] + + spell("เหตการณ") + # output: ['เหตุการณ์'] + """ + if engine == "phunspell": + from pythainlp.spell.phunspell import spell as SPELL_CHECKER + + text_correct = SPELL_CHECKER(word) + elif engine == "symspellpy": + from pythainlp.spell.symspellpy import spell as SPELL_CHECKER + + text_correct = SPELL_CHECKER(word) + elif engine == "tltk": + from pythainlp.spell.tltk import spell as SPELL_CHECKER + + text_correct = SPELL_CHECKER(word) + else: + text_correct = DEFAULT_SPELL_CHECKER.spell(word) + + return text_correct
+ + + +
+[docs] +def correct(word: str, engine: str = "pn") -> str: + """ + Corrects the spelling of the given word by returning + the correctly spelled word. + + :param str word: word to correct spelling of + :param str engine: + * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default) + * *phunspell* - A spell checker utilizing spylls, a port of Hunspell. + * *symspellpy* - symspellpy is a Python port of SymSpell v6.5. + * *wanchanberta_thai_grammarly* - WanchanBERTa Thai Grammarly + :return: the corrected word + :rtype: str + + :Example: + :: + + from pythainlp.spell import correct + + correct("เส้นตรบ") + # output: 'เส้นตรง' + + correct("ครัช") + # output: 'ครับ' + + correct("สังเกตุ") + # output: 'สังเกต' + + correct("กระปิ") + # output: 'กะปิ' + + correct("เหตการณ") + # output: 'เหตุการณ์' + """ + if engine == "phunspell": + from pythainlp.spell.phunspell import correct as SPELL_CHECKER + + text_correct = SPELL_CHECKER(word) + elif engine == "symspellpy": + from pythainlp.spell.symspellpy import correct as SPELL_CHECKER + + text_correct = SPELL_CHECKER(word) + elif engine == "wanchanberta_thai_grammarly": + from pythainlp.spell.wanchanberta_thai_grammarly import correct as SPELL_CHECKER + + text_correct = SPELL_CHECKER(word) + + else: + text_correct = DEFAULT_SPELL_CHECKER.correct(word) + + return text_correct
+ + + +
+[docs] +def spell_sent(list_words: List[str], engine: str = "pn") -> List[List[str]]: + """ + Provides a list of possible correct spellings of sentence + + :param List[str] list_words: list of words in sentence + :param str engine: + * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default) + * *phunspell* - A spell checker utilizing spylls, a port of Hunspell. + * *symspellpy* - symspellpy is a Python port of SymSpell v6.5. + :return: list of possibly correct words + :rtype: List[List[str]] + + :Example: + :: + + from pythainlp.spell import spell_sent + + spell_sent(["เด็","อินอร์เน็ต","แรง"],engine='symspellpy') + # output: [['เด็ก', 'อินเทอร์เน็ต', 'แรง']] + """ + if engine == "symspellpy": + from pythainlp.spell.symspellpy import spell_sent as symspellpy_spell + + list_new = symspellpy_spell(list_words) + else: + _temp = list( + itertools.product(*[spell(i, engine=engine) for i in list_words]) + ) + list_new = [] + for i in _temp: + _temp2 = [] + for j in i: + _temp2.append(j) + list_new.append(_temp2) + + return list_new
+ + + +
+[docs] +def correct_sent(list_words: List[str], engine: str = "pn") -> List[str]: + """ + Corrects and returns the spelling of the given sentence + + :param List[str] list_words: list of words in sentence + :param str engine: + * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default) + * *phunspell* - A spell checker utilizing spylls, a port of Hunspell. + * *symspellpy* - symspellpy is a Python port of SymSpell v6.5. + * *wanchanberta_thai_grammarly* - WanchanBERTa Thai Grammarly + :return: the corrected list of words in sentence + :rtype: List[str] + + :Example: + :: + + from pythainlp.spell import correct_sent + + correct_sent(["เด็","อินอร์เน็ต","แรง"],engine='symspellpy') + # output: ['เด็ก', 'อินเทอร์เน็ต', 'แรง'] + """ + return spell_sent(list_words, engine=engine)[0]
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/spell/pn.html b/5.1/_modules/pythainlp/spell/pn.html new file mode 100644 index 0000000..d79e8fb --- /dev/null +++ b/5.1/_modules/pythainlp/spell/pn.html @@ -0,0 +1,534 @@ + + + + + + + + pythainlp.spell.pn — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.spell.pn

+# -*- coding: utf-8 -*-
+"""
+Spell checker, using Peter Norvig's algorithm.
+Spelling dictionary can be customized.
+Default spelling dictionary is based on Thai National Corpus.
+
+Based on Peter Norvig's Python code from http://norvig.com/spell-correct.html
+"""
+from collections import Counter
+from string import digits
+from typing import (
+    Callable,
+    Dict,
+    ItemsView,
+    Iterable,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)
+
+from pythainlp import thai_digits, thai_letters
+from pythainlp.corpus import tnc
+from pythainlp.util import isthaichar
+
+
+def _no_filter(word: str) -> bool:
+    return True
+
+
+def _is_thai_and_not_num(word: str) -> bool:
+    for ch in word:
+        if ch != "." and not isthaichar(ch):
+            return False
+        if ch in thai_digits or ch in digits:
+            return False
+    return True
+
+
+def _keep(
+    word_freq: Tuple[str, int],
+    min_freq: int,
+    min_len: int,
+    max_len: int,
+    dict_filter: Callable[[str], bool],
+) -> bool:
+    """
+    Checks whether a given word has the required minimum frequency min_freq
+    and its character length is between min_len and max_len (inclusive).
+    """
+    if not word_freq or word_freq[1] < min_freq:
+        return False
+
+    word = word_freq[0]
+    if not (word and min_len <= len(word) <= max_len and word[0] != "."):
+        return False
+
+    return dict_filter(word)
+
+
+def _edits1(word: str) -> Set[str]:
+    """
+    Returns a set of words with an edit distance of 1 from the input word
+    """
+    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
+    deletes = [L + R[1:] for L, R in splits if R]
+    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
+    replaces = [L + c + R[1:] for L, R in splits if R for c in thai_letters]
+    inserts = [L + c + R for L, R in splits for c in thai_letters]
+
+    return set(deletes + transposes + replaces + inserts)
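A tiny, hypothetical check of what `_edits1` produces for a two-character word (not part of the library source; it relies only on the deletion and transposition candidates built above):

    cands = _edits1("รถ")
    print("ถ" in cands)   # True -- deletion of the first character
    print("ถร" in cands)  # True -- transposition of the two characters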
+
+
+def _edits2(word: str) -> Set[str]:
+    """
+    Returns a set of words with an edit distance of 2 from the input word
+    """
+    return set(e2 for e1 in _edits1(word) for e2 in _edits1(e1))
+
+
+def _convert_custom_dict(
+    custom_dict: Union[
+        Dict[str, int], Iterable[str], Iterable[Tuple[str, int]]
+    ],
+    min_freq: int,
+    min_len: int,
+    max_len: int,
+    dict_filter: Optional[Callable[[str], bool]],
+) -> List[Tuple[str, int]]:
+    """
+    Converts a custom dictionary to a list of (str, int) tuples
+    """
+    if isinstance(custom_dict, dict):
+        custom_dict = list(custom_dict.items())
+
+    i = iter(custom_dict)
+    first_member = next(i)
+    if isinstance(first_member, str):
+        # create tuples of a word with frequency equaling 1,
+        # and filter word list
+        custom_dict = [
+            (word, 1)
+            for word in custom_dict
+            if _keep((word, 1), 1, min_len, max_len, dict_filter)
+        ]
+    elif isinstance(first_member, tuple):
+        # filter word list
+        custom_dict = [
+            word_freq
+            for word_freq in custom_dict
+            if _keep(word_freq, min_freq, min_len, max_len, dict_filter)
+        ]
+    else:
+        raise TypeError(
+            "custom_dict must be either Dict[str, int], "
+            "Iterable[Tuple[str, int]], or Iterable[str]"
+        )
+
+    return custom_dict
+
+
+
+[docs] +class NorvigSpellChecker: +
+[docs]
+    def __init__(
+        self,
+        custom_dict: Union[
+            Dict[str, int], Iterable[str], Iterable[Tuple[str, int]]
+        ] = None,
+        min_freq: int = 2,
+        min_len: int = 2,
+        max_len: int = 40,
+        dict_filter: Optional[Callable[[str], bool]] = _is_thai_and_not_num,
+    ):
+        """
+        Initializes Peter Norvig's spell checker object.
+        Spelling dictionary can be customized.
+        By default, spelling dictionary is from
+        `Thai National Corpus <http://www.arts.chula.ac.th/ling/tnc/>`_
+
+        Basically, Norvig's spell checker will choose the most likely
+        corrected spelling given a word by searching for candidates of
+        corrected words based on edit distance.
+        Then, it selects the candidate with
+        the highest word occurrence probability.
+
+        :param str custom_dict: A custom spelling dictionary. This can be:
+                                (1) a dictionary (`dict`), with words (`str`)
+                                as keys and frequencies (`int`) as values;
+                                (2) an iterable (list, tuple, or set) of words
+                                (`str`) and frequency (`int`) tuples:
+                                `(str, int)`; or
+                                (3) an iterable of just words (`str`), without
+                                frequencies -- in this case a frequency of `1`
+                                will be assigned to every word.
+                                Default is from Thai National Corpus (around
+                                40,000 words).
+        :param int min_freq: Minimum frequency of a word to keep (default = 2)
+        :param int min_len: Minimum length (in characters) of a word to keep
+                            (default = 2)
+        :param int max_len: Maximum length (in characters) of a word to keep
+                            (default = 40)
+        :param func dict_filter: A function to filter the dictionary.
+                                 Default filter removes any word
+                                 with numbers or non-Thai characters.
+                                 If no filter is required, use None.
+        """
+        if not custom_dict:  # default, use Thai National Corpus
+            # TODO: #680 change the dict
+            custom_dict = [(i, j) for i, j in tnc.word_freqs()]
+
+        if not dict_filter:
+            dict_filter = _no_filter
+
+        custom_dict = _convert_custom_dict(
+            custom_dict, min_freq, min_len, max_len, dict_filter
+        )
+
+        self.__WORDS = Counter(dict(custom_dict))
+        self.__WORDS += Counter()  # remove zero and negative counts
+        self.__WORDS_TOTAL = sum(self.__WORDS.values())
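The three accepted shapes of `custom_dict` described above can be passed like this (a minimal sketch; it assumes the sample words pass the default length and Thai-only filters):

    from pythainlp.spell import NorvigSpellChecker

    # (1) dict of word -> frequency
    checker_a = NorvigSpellChecker(custom_dict={"หวาน": 30, "มะนาว": 2})

    # (2) iterable of (word, frequency) tuples
    checker_b = NorvigSpellChecker(custom_dict=[("หวาน", 30), ("มะนาว", 2)])

    # (3) iterable of plain words; each word gets a frequency of 1
    checker_c = NorvigSpellChecker(custom_dict=["หวาน", "มะนาว"])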
+ + +
+[docs] + def dictionary(self) -> ItemsView[str, int]: + """ + Returns the spelling dictionary currently used by this spell checker + + :return: spelling dictionary of this instance + :rtype: list[tuple[str, int]] + + :Example: + :: + + from pythainlp.spell import NorvigSpellChecker + + dictionary= [("หวาน", 30), ("มะนาว", 2), ("แอบ", 3223)] + + checker = NorvigSpellChecker(custom_dict=dictionary) + checker.dictionary() + # output: dict_items([('หวาน', 30), ('มะนาว', 2), ('แอบ', 3223)]) + """ + return self.__WORDS.items()
+ + +
+[docs] + def known(self, words: Iterable[str]) -> List[str]: + """ + Returns a list of given words found in the spelling dictionary + + :param list[str] words: A list of words to check if they exist + in the spelling dictionary + + :return: intersection of the given word list and words + in the spelling dictionary + :rtype: list[str] + + :Example: + :: + + from pythainlp.spell import NorvigSpellChecker + + checker = NorvigSpellChecker() + + checker.known(["เพยน", "เพล", "เพลง"]) + # output: ['เพล', 'เพลง'] + + checker.known(['ยกไ', 'ไฟล์ม']) + # output: [] + + checker.known([]) + # output: [] + """ + return list(w for w in words if w in self.__WORDS)
+ + +
+[docs] + def prob(self, word: str) -> float: + """ + Returns the probability of an input word, + according to the spelling dictionary + + :param str word: A word to check occurrence probability of + + :return: word occurrence probability + :rtype: float + + :Example: + :: + + from pythainlp.spell import NorvigSpellChecker + + checker = NorvigSpellChecker() + + checker.prob("ครัช") + # output: 0.0 + + checker.prob("รัก") + # output: 0.0006959172792052158 + + checker.prob("น่ารัก") + # output: 9.482306849763902e-05 + """ + return self.__WORDS[word] / self.__WORDS_TOTAL
+ + +
+[docs] + def freq(self, word: str) -> int: + """ + Returns the frequency of an input word, + according to the spelling dictionary + + :param str word: A word to check frequency of + :return: frequency of the given word in the spelling dictionary + :rtype: int + + :Example: + :: + + from pythainlp.spell import NorvigSpellChecker + + checker = NorvigSpellChecker() + + checker.freq("ปัญญา") + # output: 3639 + + checker.freq("บิญชา") + # output: 0 + """ + return self.__WORDS[word]
+ + +
+[docs] + def spell(self, word: str) -> List[str]: + """ + Returns a list of all correctly-spelled words whose spelling + is similar to the given word by edit distance metrics. + The returned list of words will be sorted by decreasing + order of word frequencies in the word spelling dictionary. + + First, if the input word is spelled correctly, + this method returns a list of exactly one word which is itself. + Next, this method looks for a list of all correctly spelled words + whose edit distance value is 1 from the input word. + If there is no such word, then the search expands to + a list of words whose edit distance value is 2. + And if that still fails, the list of input words is returned. + + :param str word: A word to check spelling of + + :return: list of possibly correct words within 1 or 2 edit distance + and sorted by frequency of word occurrence in the + spelling dictionary in descending order. + :rtype: list[str] + + :Example: + :: + + from pythainlp.spell import NorvigSpellChecker + + checker = NorvigSpellChecker() + + checker.spell("เส้นตรบ") + # output: ['เส้นตรง'] + + checker.spell("ครัช") + # output: ['ครับ', 'ครัว', 'รัช', 'ครัม', 'ครัน', + # 'วรัช', 'ครัส', 'ปรัช', 'บรัช', 'ครัง', + #'คัช', 'คลัช', 'ครัย', 'ครัด'] + """ + if not word: + return [""] + + candidates = ( + self.known([word]) + or self.known(_edits1(word)) + or self.known(_edits2(word)) + or [word] + ) + candidates.sort(key=self.freq, reverse=True) + + return candidates
+ + +
+[docs] + def correct(self, word: str) -> str: + """ + Returns the most possible word, using the probability from + the spelling dictionary + + :param str word: A word to correct spelling of + + :return: the correct spelling of the given word + :rtype: str + + :Example: + :: + + from pythainlp.spell import NorvigSpellChecker + + checker = NorvigSpellChecker() + + checker.correct("ปัญชา") + # output: 'ปัญหา' + + checker.correct("บิญชา") + # output: 'บัญชา' + + checker.correct("มิตรภาบ") + # output: 'มิตรภาพ' + """ + if not word: + return "" + + # Check for numeric type + try: + if "." in word: + float(word) + else: + int(word) + return word + except ValueError: + pass + + return self.spell(word)[0]
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/summarize/core.html b/5.1/_modules/pythainlp/summarize/core.html new file mode 100644 index 0000000..c39bbb1 --- /dev/null +++ b/5.1/_modules/pythainlp/summarize/core.html @@ -0,0 +1,396 @@ + + + + + + + + pythainlp.summarize.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.summarize.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Text summarization and keyword extraction
+"""
+
+from typing import Iterable, List, Optional, Tuple
+
+from pythainlp.summarize import (
+    CPE_KMUTT_THAI_SENTENCE_SUM,
+    DEFAULT_KEYWORD_EXTRACTION_ENGINE,
+    DEFAULT_SUMMARIZE_ENGINE,
+)
+from pythainlp.summarize.freq import FrequencySummarizer
+from pythainlp.tokenize import sent_tokenize
+
+
+
+[docs] +def summarize( + text: str, + n: int = 1, + engine: str = DEFAULT_SUMMARIZE_ENGINE, + tokenizer: str = "newmm", +) -> List[str]: + """ + This function summarizes text based on frequency of words. + + Under the hood, this function first tokenizes sentences from the given + text with :func:`pythainlp.tokenize.sent_tokenize`. + Then, it computes frequencies of tokenized words + (with :func:`pythainlp.tokenize.word_tokenize`) in all sentences + and normalizes them with maximum word frequency. The words with normalized + frequencies that are less than 0.1 or greater than 0.9 will be + filtered out from frequency dictionary. Finally, it picks *n* sentences + with highest sum of normalized frequency from all words which are + in the sentence and also appear in the frequency dictionary. + + :param str text: text to be summarized + :param int n: number of sentences to be included in the summary + By default, n is *1* (effective for frequency engine only) + :param str engine: text summarization engine (By default: *frequency*). + :param str tokenizer: word tokenizer engine name (refer to + :func:`pythainlp.tokenize.word_tokenize`). + By default, tokenizer is *newmm* + (effective for frequency engine only) + + :return: list of selected sentences + **Options for engine** + * *frequency* (default) - frequency of words + * *mt5* - mT5-small model + * *mt5-small* - mT5-small model + * *mt5-base* - mT5-base model + * *mt5-large* - mT5-large model + * *mt5-xl* - mT5-xl model + * *mt5-xxl* - mT5-xxl model + * *mt5-cpe-kmutt-thai-sentence-sum* - mT5 Thai sentence summarization by CPE KMUTT + + :Example: + :: + + from pythainlp.summarize import summarize + + text = ''' + ทำเนียบท่าช้าง หรือ วังถนนพระอาทิตย์ + ตั้งอยู่บนถนนพระอาทิตย์ เขตพระนคร กรุงเทพมหานคร + เดิมเป็นบ้านของเจ้าพระยามหาโยธา (ทอเรียะ คชเสนี) + บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์ (พญาเจ่ง) + ต้นสกุลคชเสนี เชื้อสายมอญ เจ้าพระยามหาโยธา (ทอเรีย) + เป็นปู่ของเจ้าจอมมารดากลิ่นในพระบาทสมเด็จพระจอมเกล้าเจ้าอยู่หัว + และเป็นมรดกตกทอดมาถึง พระเจ้าบรมวงศ์เธอ กรมพระนเรศรวรฤทธิ์ + (พระองค์เจ้ากฤดาภินิหาร) + ต่อมาในรัชสมัยพระบาทสมเด็จพระจุลจอมเกล้าเจ้าอยู่หัวโปรดเกล้าฯ + ให้สร้างตำหนัก 2 ชั้น + เป็นที่ประทับของพระเจ้าบรมวงศ์เธอ + กรมพระนเรศวรฤทิธิ์และเจ้าจอมมารดา + ต่อมาเรียกอาคารหลักนี้ว่า ตำหนักเดิม + ''' + + summarize(text, n=1) + # output: ['บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์'] + + summarize(text, n=3) + # output: ['บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์', + # 'เดิมเป็นบ้านของเจ้าพระยามหาโยธา', + # 'เจ้าพระยามหาโยธา'] + + summarize(text, engine="mt5-small") + # output: ['<extra_id_0> ท่าช้าง หรือ วังถนนพระอาทิตย์ + # เขตพระนคร กรุงเทพมหานคร ฯลฯ ดังนี้: + # ที่อยู่ - ศิลปวัฒนธรรม'] + + text = "ถ้าพูดถึงขนมหวานในตำนานที่ชื่นใจที่สุดแล้วละก็ต้องไม่พ้น น้ำแข็งใส แน่ๆ เพราะว่าเป็นอะไรที่ชื่นใจสุดๆ" + summarize(text, engine="mt5-cpe-kmutt-thai-sentence-sum") + # output: ['น้ําแข็งใสเป็นอะไรที่ชื่นใจที่สุด'] + """ + if not text or not isinstance(text, str): + return [] + sents = [] + + if engine == DEFAULT_SUMMARIZE_ENGINE: + sents = FrequencySummarizer().summarize(text, n, tokenizer) + elif engine == CPE_KMUTT_THAI_SENTENCE_SUM: + from .mt5 import mT5Summarizer + + sents = mT5Summarizer( + pretrained_mt5_model_name=CPE_KMUTT_THAI_SENTENCE_SUM, min_length=5 + ).summarize(text) + elif engine.startswith("mt5-") or engine == "mt5": + size = engine.replace("mt5-", "") + from .mt5 import mT5Summarizer + + sents = mT5Summarizer(model_size=size).summarize(text) + else: # if engine not found, return first n sentences + sents = 
sent_tokenize(text, engine="whitespace+newline")[:n] + + return sents
+ + + +
+[docs] +def extract_keywords( + text: str, + keyphrase_ngram_range: Tuple[int, int] = (1, 2), + max_keywords: int = 5, + min_df: int = 1, + engine: str = DEFAULT_KEYWORD_EXTRACTION_ENGINE, + tokenizer: str = "newmm", + stop_words: Optional[Iterable[str]] = None, +) -> List[str]: + """ + This function returns most-relevant keywords (and/or keyphrases) from the input document. + Each algorithm may produce completely different keywords from each other, + so please be careful when choosing the algorithm. + + *Note*: Calling :func: `extract_keywords()` is expensive. For repetitive use of KeyBERT (the default engine), + creating KeyBERT object is highly recommended. + + :param str text: text to be summarized + :param Tuple[int, int] keyphrase_ngram_range: Number of token units to be defined as keyword. + The token unit varies w.r.t. `tokenizer_engine`. + For instance, (1, 1) means each token (unigram) can be a keyword (e.g. "เสา", "ไฟฟ้า"), + (1, 2) means one and two consecutive tokens (unigram and bigram) can be keywords + (e.g. "เสา", "ไฟฟ้า", "เสาไฟฟ้า") (default: (1, 2)) + :param int max_keywords: Number of maximum keywords to be returned. (default: 5) + :param int min_df: Minimum frequency required to be a keyword. (default: 1) + :param str engine: Name of algorithm to use for keyword extraction. (default: 'keybert') + :param str tokenizer: Name of tokenizer engine to use. + Refer to options in :func: `pythainlp.tokenize.word_tokenizer() (default: 'newmm') + :param Optional[Iterable[str]] stop_words: A list of stop words (a.k.a words to be ignored). + If not specified, :func:`pythainlp.corpus.thai_stopwords` is used. (default: None) + + :return: list of keywords + + **Options for engine** + * *keybert* (default) - KeyBERT keyword extraction algorithm + * *frequency* - frequency of words + + :Example: + :: + + from pythainlp.summarize import extract_keywords + + text = ''' + อาหาร หมายถึง ของแข็งหรือของเหลว + ที่กินหรือดื่มเข้าสู่ร่างกายแล้ว + จะทำให้เกิดพลังงานและความร้อนแก่ร่างกาย + ทำให้ร่างกายเจริญเติบโต + ซ่อมแซมส่วนที่สึกหรอ ควบคุมการเปลี่ยนแปลงต่างๆ ในร่างกาย + ช่วยทำให้อวัยวะต่างๆ ทำงานได้อย่างปกติ + อาหารจะต้องไม่มีพิษและไม่เกิดโทษต่อร่างกาย + ''' + + keywords = extract_keywords(text) + + # output: ['อวัยวะต่างๆ', + # 'ซ่อมแซมส่วน', + # 'เจริญเติบโต', + # 'ควบคุมการเปลี่ยนแปลง', + # 'มีพิษ'] + + keywords = extract_keywords(text, max_keywords=10) + + # output: ['อวัยวะต่างๆ', + # 'ซ่อมแซมส่วน', + # 'เจริญเติบโต', + # 'ควบคุมการเปลี่ยนแปลง', + # 'มีพิษ', + # 'ทำให้ร่างกาย', + # 'ร่างกายเจริญเติบโต', + # 'จะทำให้เกิด', + # 'มีพิษและ', + # 'เกิดโทษ'] + + """ + + def rank_by_frequency( + text: str, + max_keywords: int = 5, + min_df: int = 5, + tokenizer: str = "newmm", + stop_words: Optional[Iterable[str]] = None, + ): + from pythainlp.tokenize import word_tokenize + from pythainlp.util.keywords import rank + + tokens = word_tokenize(text, engine=tokenizer, keep_whitespace=False) + + use_custom_stop_words = stop_words is not None + + if use_custom_stop_words: + tokens = [token for token in tokens if token not in stop_words] + + word_rank = rank(tokens, exclude_stopwords=not use_custom_stop_words) + + keywords = [ + kw + for kw, cnt in word_rank.most_common(max_keywords) + if cnt >= min_df + ] + + return keywords + + engines = ["keybert", "frequency"] + + if engine == "keybert": + from .keybert import KeyBERT + + keywords = KeyBERT().extract_keywords( + text, + keyphrase_ngram_range=keyphrase_ngram_range, + max_keywords=max_keywords, + min_df=min_df, + tokenizer=tokenizer, + 
return_similarity=False, + stop_words=stop_words, + ) + elif engine == "frequency": + return rank_by_frequency( + text, + max_keywords=max_keywords, + min_df=min_df, + tokenizer=tokenizer, + stop_words=stop_words, + ) + + else: + # currently not supported + raise ValueError( + f"Keyword extractor {repr(engine)} is currently not supported. " + f"Use one of {engines}." + ) + + return keywords
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/summarize/keybert.html b/5.1/_modules/pythainlp/summarize/keybert.html new file mode 100644 index 0000000..3807f23 --- /dev/null +++ b/5.1/_modules/pythainlp/summarize/keybert.html @@ -0,0 +1,379 @@ + + + + + + + + pythainlp.summarize.keybert — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.summarize.keybert

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Minimal re-implementation of KeyBERT.
+
+KeyBERT is a minimal and easy-to-use keyword extraction technique
+that leverages BERT embeddings to create keywords and keyphrases
+that are most similar to a document.
+
+https://github.com/MaartenGr/KeyBERT
+"""
+from collections import Counter
+from typing import Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+from transformers import pipeline
+
+from pythainlp.corpus import thai_stopwords
+from pythainlp.tokenize import word_tokenize
+
+
+
+[docs] +class KeyBERT: +
+[docs] + def __init__( + self, model_name: str = "airesearch/wangchanberta-base-att-spm-uncased" + ): + self.ft_pipeline = pipeline( + "feature-extraction", + tokenizer=model_name, + model=model_name, + revision="main", + )
+ + +
+[docs] + def extract_keywords( + self, + text: str, + keyphrase_ngram_range: Tuple[int, int] = (1, 2), + max_keywords: int = 5, + min_df: int = 1, + tokenizer: str = "newmm", + return_similarity=False, + stop_words: Optional[Iterable[str]] = None, + ) -> Union[List[str], List[Tuple[str, float]]]: + """ + Extract Thai keywords and/or keyphrases with KeyBERT algorithm. + See https://github.com/MaartenGr/KeyBERT. + + :param str text: text to be summarized + :param Tuple[int, int] keyphrase_ngram_range: Number of token units to be defined as keyword. + The token unit varies w.r.t. `tokenizer_engine`. + For instance, (1, 1) means each token (unigram) can be a keyword (e.g. "เสา", "ไฟฟ้า"), + (1, 2) means one and two consecutive tokens (unigram and bigram) can be keywords + (e.g. "เสา", "ไฟฟ้า", "เสาไฟฟ้า") (default: (1, 2)) + :param int max_keywords: Number of maximum keywords to be returned. (default: 5) + :param int min_df: Minimum frequency required to be a keyword. (default: 1) + :param str tokenizer: Name of tokenizer engine to use. + Refer to options in :func: `pythainlp.tokenize.word_tokenizer() (default: 'newmm') + :param bool return_similarity: If `True`, return keyword scores. (default: False) + :param Optional[Iterable[str]] stop_words: A list of stop words (a.k.a words to be ignored). + If not specified, :func:`pythainlp.corpus.thai_stopwords` is used. (default: None) + + :return: list of keywords with score + + :Example: + :: + + from pythainlp.summarize.keybert import KeyBERT + + text = ''' + อาหาร หมายถึง ของแข็งหรือของเหลว + ที่กินหรือดื่มเข้าสู่ร่างกายแล้ว + จะทำให้เกิดพลังงานและความร้อนแก่ร่างกาย + ทำให้ร่างกายเจริญเติบโต + ซ่อมแซมส่วนที่สึกหรอ ควบคุมการเปลี่ยนแปลงต่างๆ ในร่างกาย + ช่วยทำให้อวัยวะต่างๆ ทำงานได้อย่างปกติ + อาหารจะต้องไม่มีพิษและไม่เกิดโทษต่อร่างกาย + ''' + + kb = KeyBERT() + + keywords = kb.extract_keyword(text) + + # output: ['อวัยวะต่างๆ', + # 'ซ่อมแซมส่วน', + # 'เจริญเติบโต', + # 'ควบคุมการเปลี่ยนแปลง', + # 'มีพิษ'] + + keywords = kb.extract_keyword(text, max_keywords=10, return_similarity=True) + + # output: [('อวัยวะต่างๆ', 0.3228477063109462), + # ('ซ่อมแซมส่วน', 0.31320597838000375), + # ('เจริญเติบโต', 0.29115434699705506), + # ('ควบคุมการเปลี่ยนแปลง', 0.2678430841321016), + # ('มีพิษ', 0.24996827960821494), + # ('ทำให้ร่างกาย', 0.23876962942443258), + # ('ร่างกายเจริญเติบโต', 0.23191285218852364), + # ('จะทำให้เกิด', 0.22425422716846247), + # ('มีพิษและ', 0.22162962875299588), + # ('เกิดโทษ', 0.20773497763458507)] + + """ + try: + text = text.strip() + except AttributeError: + raise AttributeError( + f"Unable to process data of type {type(text)}. " + f"Please provide input of string type." + ) + + if not text: + return [] + + # generate all lists of keywords / keyphrases + stop_words_ = stop_words if stop_words else thai_stopwords() + kw_candidates = _generate_ngrams( + text, keyphrase_ngram_range, min_df, tokenizer, stop_words_ + ) + + # create document and word vectors + doc_vector = self.embed(text) + kw_vectors = self.embed(kw_candidates) + + # rank keywords + keywords = _rank_keywords( + doc_vector, kw_vectors, kw_candidates, max_keywords + ) + + if return_similarity: + return keywords + else: + return [kw for kw, _ in keywords]
+ + +
+[docs] + def embed(self, docs: Union[str, List[str]]) -> np.ndarray: + """ + Create an embedding of each input in `docs` by averaging vectors from the last hidden layer. + """ + embs = self.ft_pipeline(docs) + if isinstance(docs, str) or len(docs) == 1: + # embed doc. return shape = [1, hidden_size] + emb_mean = np.array(embs).mean(axis=1) + else: + # mean of embedding of each word + # return shape = [len(docs), hidden_size] + emb_mean = np.stack( + [np.array(emb[0]).mean(axis=0) for emb in embs] + ) + + return emb_mean
+
+ + + +def _generate_ngrams( + doc: str, + keyphrase_ngram_range: Tuple[int, int], + min_df: int, + tokenizer_engine: str, + stop_words: Iterable[str], +) -> List[str]: + assert keyphrase_ngram_range[0] >= 1, ( + f"`keyphrase_ngram_range` must start from 1. " + f"current value={keyphrase_ngram_range}." + ) + + assert keyphrase_ngram_range[0] <= keyphrase_ngram_range[1], ( + f"The value first argument of `keyphrase_ngram_range` must not exceed the second. " + f"current value={keyphrase_ngram_range}." + ) + + def _join_ngram(ngrams: List[Tuple[str, str]]) -> List[str]: + ngrams_joined = [] + for ng in ngrams: + joined = "".join(ng) + if joined.strip() == joined: + # ngram must not start or end with whitespace as this may cause duplication. + ngrams_joined.append(joined) + return ngrams_joined + + words = word_tokenize(doc, engine=tokenizer_engine) + all_grams = [] + ngram_range = (keyphrase_ngram_range[0], keyphrase_ngram_range[1] + 1) + for n in range(*ngram_range): + if n == 1: + # filter out space + ngrams = [word for word in words if word.strip()] + else: + ngrams_tuple = zip(*[words[i:] for i in range(n)]) + ngrams = _join_ngram(ngrams_tuple) + + ngrams_cnt = Counter(ngrams) + ngrams = [ + word + for word, freq in ngrams_cnt.items() + if (freq >= min_df) and (word not in stop_words) + ] + all_grams.extend(ngrams) + + return all_grams + + +def _rank_keywords( + doc_vector: np.ndarray, + word_vectors: np.ndarray, + keywords: List[str], + max_keywords: int, +) -> List[Tuple[str, float]]: + def l2_norm(v: np.ndarray) -> np.ndarray: + vec_size = v.shape[1] + result = np.divide( + v, + np.linalg.norm(v, axis=1).reshape(-1, 1).repeat(vec_size, axis=1), + ) + assert np.isclose( + np.linalg.norm(result, axis=1), 1 + ).all(), "Cannot normalize a vector to unit vector." + return result + + def cosine_sim(a: np.ndarray, b: np.ndarray) -> np.ndarray: + return (np.matmul(a, b.T).T).sum(axis=1) + + doc_vector = l2_norm(doc_vector) + word_vectors = l2_norm(word_vectors) + cosine_sims = cosine_sim(doc_vector, word_vectors) + ranking_desc = np.argsort(-cosine_sims) + + final_ranks = [ + (keywords[r], cosine_sims[r]) for r in ranking_desc[:max_keywords] + ] + return final_ranks +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tag/chunk.html b/5.1/_modules/pythainlp/tag/chunk.html new file mode 100644 index 0000000..3606902 --- /dev/null +++ b/5.1/_modules/pythainlp/tag/chunk.html @@ -0,0 +1,176 @@ + + + + + + + + pythainlp.tag.chunk — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tag.chunk

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from typing import List, Tuple
+
+
+
+[docs]
+def chunk_parse(
+    sent: List[Tuple[str, str]], engine: str = "crf", corpus: str = "orchidpp"
+) -> List[str]:
+    """
+    This function parses a Thai sentence into phrase structure in IOB format.
+
+    :param list sent: a list of (word, part-of-speech) tuples
+    :param str engine: chunk parsing engine (currently, only crf is available)
+    :param str corpus: chunk parsing corpus (currently, only orchidpp is available)
+
+    :return: a list of chunk tags in IOB format, one tag per input word
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.tag import chunk_parse, pos_tag
+
+        tokens = ["ผม", "รัก", "คุณ"]
+        tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid")
+
+        print(chunk_parse(tokens_pos))
+        # output: ['B-NP', 'B-VP', 'I-VP']
+    """
+    from .crfchunk import CRFchunk
+
+    _engine = CRFchunk()
+    return _engine.parse(sent)
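Because the tags come back as a flat IOB list, callers usually want to regroup the original words into phrases. A small illustrative sketch of that regrouping (the grouping code below is not part of PyThaiNLP)::

    from pythainlp.tag import chunk_parse, pos_tag

    tokens = ["ผม", "รัก", "คุณ"]
    tags = chunk_parse(pos_tag(tokens, engine="perceptron", corpus="orchid"))
    # e.g. ['B-NP', 'B-VP', 'I-VP']

    # start a new phrase at every B- tag, attach I- tags to the open phrase
    phrases, current = [], []
    for word, tag in zip(tokens, tags):
        if tag.startswith("B-") and current:
            phrases.append(current)
            current = []
        current.append((word, tag))
    if current:
        phrases.append(current)
    print(phrases)
    # [[('ผม', 'B-NP')], [('รัก', 'B-VP'), ('คุณ', 'I-VP')]]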
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tag/locations.html b/5.1/_modules/pythainlp/tag/locations.html new file mode 100644 index 0000000..5c3a669 --- /dev/null +++ b/5.1/_modules/pythainlp/tag/locations.html @@ -0,0 +1,179 @@ + + + + + + + + pythainlp.tag.locations — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tag.locations

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Recognizes locations in text
+"""
+
+from typing import List, Tuple
+
+from pythainlp.corpus import provinces
+
+
+
+[docs] +def tag_provinces(tokens: List[str]) -> List[Tuple[str, str]]: + """ + This function recognizes Thailand provinces in text. + + Note that it uses exact match and considers no context. + + :param list[str] tokens: a list of words + :return: a list of tuples indicating NER for `LOCATION` in IOB format + :rtype: list[tuple[str, str]] + + :Example: + :: + + from pythainlp.tag import tag_provinces + + text = ['หนองคาย', 'น่าอยู่'] + tag_provinces(text) + # output: [('หนองคาย', 'B-LOCATION'), ('น่าอยู่', 'O')] + """ + province_list = provinces() + output = [ + (token, "B-LOCATION") if token in province_list else (token, "O") + for token in tokens + ] + return output
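To reduce the IOB output to a plain list of detected provinces, it is enough to keep the tokens tagged ``B-LOCATION``. A minimal sketch using only the behaviour documented above::

    from pythainlp.tag import tag_provinces

    tokens = ["หนองคาย", "น่าอยู่"]
    tagged = tag_provinces(tokens)  # [('หนองคาย', 'B-LOCATION'), ('น่าอยู่', 'O')]

    provinces_found = [word for word, tag in tagged if tag == "B-LOCATION"]
    print(provinces_found)  # ['หนองคาย']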
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tag/named_entity.html b/5.1/_modules/pythainlp/tag/named_entity.html new file mode 100644 index 0000000..595a17a --- /dev/null +++ b/5.1/_modules/pythainlp/tag/named_entity.html @@ -0,0 +1,328 @@ + + + + + + + + pythainlp.tag.named_entity — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tag.named_entity

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Named-entity recognizer
+"""
+from typing import List, Tuple, Union
+
+
+
+[docs] +class NER: + """ + Class of named-entity recognizer + + :param str engine: engine of named-entity recognizer + :param str corpus: corpus + + **Options for engine** + * *thainer-v2* - Thai NER engine v2.0 for Thai NER 2.0 (default) + * *thainer* - Thai NER engine + * *tltk* - wrapper for `TLTK <https://pypi.org/project/tltk/>`_. + + **Options for corpus** + * *thainer* - Thai NER corpus (default) + + **Note**: The tltk engine supports NER models from tltk only. + """ + +
+[docs] + def __init__(self, engine: str = "thainer-v2", corpus: str = "thainer") -> None: + self.load_engine(engine=engine, corpus=corpus)
+ + +
+[docs] + def load_engine(self, engine: str, corpus: str) -> None: + self.name_engine = engine + self.engine = None + if engine == "thainer" and corpus == "thainer": + from pythainlp.tag.thainer import ThaiNameTagger + + self.engine = ThaiNameTagger() + elif engine == "thainer-v2" and corpus == "thainer": + from pythainlp.wangchanberta import NamedEntityRecognition + self.engine = NamedEntityRecognition(model="pythainlp/thainer-corpus-v2-base-model") + elif engine == "tltk": + from pythainlp.tag import tltk + + self.engine = tltk + elif engine == "wangchanberta" and corpus == "thainer": + from pythainlp.wangchanberta import ThaiNameTagger + + self.engine = ThaiNameTagger(dataset_name=corpus) + elif engine == "phayathaibert" and corpus == "thainer-v2": + from pythainlp.phayathaibert.core import NamedEntityTagger + + self.engine = NamedEntityTagger() + else: + raise ValueError( + "NER class not support {0} engine or {1} corpus.".format( + engine, corpus + ) + )
+ + +
+[docs] + def tag(self, + text, + pos=False, + tag=False + ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: + """ + This function tags named entities in text in IOB format. + + :param str text: text in Thai to be tagged + :param bool pos: output with part-of-speech tags.\ + (wangchanberta is not supported) + :param bool tag: output HTML-like tags. + :return: a list of tuples associated with tokenized words, NER tags, + POS tags (if the parameter `pos` is specified as `True`), + and output HTML-like tags (if the parameter `tag` is + specified as `True`). + Otherwise, return a list of tuples associated with tokenized + words and NER tags + :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str] + :Example: + + >>> from pythainlp.tag import NER + >>> + >>> ner = NER("thainer") + >>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์") + [('ทดสอบ', 'O'), + ('นาย', 'B-PERSON'), + ('วรรณ', 'I-PERSON'), + ('พงษ์', 'I-PERSON'), + (' ', 'I-PERSON'), + ('ภัททิย', 'I-PERSON'), + ('ไพบูลย์', 'I-PERSON')] + >>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์", tag=True) + 'ทดสอบ<PERSON>นายวรรณพงษ์ ภัททิยไพบูลย์</PERSON>' + """ + return self.engine.get_ner(text, tag=tag, pos=pos)
+
+ + + +
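When only the entity strings are needed, the IOB pairs returned by ``NER.tag`` (with ``pos=False``) can be merged back into spans. A small helper sketch; it is not part of PyThaiNLP and reuses the example pairs from the docstring above::

    from typing import List, Tuple

    def iob_to_entities(pairs: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        # collapse runs of B-/I- tags into (entity text, entity type) spans
        entities, text, label = [], "", ""
        for word, tag in pairs:
            if tag.startswith("B-"):
                if text:
                    entities.append((text, label))
                text, label = word, tag[2:]
            elif tag.startswith("I-") and text:
                text += word
            else:  # an 'O' tag closes any open entity
                if text:
                    entities.append((text, label))
                text, label = "", ""
        if text:
            entities.append((text, label))
        return entities

    pairs = [("ทดสอบ", "O"), ("นาย", "B-PERSON"), ("วรรณ", "I-PERSON"), ("พงษ์", "I-PERSON")]
    print(iob_to_entities(pairs))  # [('นายวรรณพงษ์', 'PERSON')]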
+[docs] +class NNER: + """ + Nested Named Entity Recognition + + :param str engine: engine of nested named entity recognizer + :param str corpus: corpus + + **Options for engine** + * *thai_nner* - Thai NER engine + """ + +
+[docs] + def __init__(self, engine: str = "thai_nner") -> None: + self.load_engine(engine)
+ + +
+[docs] + def load_engine(self, engine: str = "thai_nner") -> None: + from pythainlp.tag.thai_nner import Thai_NNER + + self.engine = Thai_NNER()
+ + +
+[docs] + def tag(self, text) -> Tuple[List[str], List[dict]]: + """ + This function tags nested named entities. + + :param str text: text in Thai to be tagged + + :return: a list of tuples associated with tokenized words and NNER tags. + :rtype: Tuple[List[str], List[dict]] + + :Example: + + >>> from pythainlp.tag.named_entity import NNER + >>> nner = NNER() + >>> nner.tag("แมวทำอะไรตอนห้าโมงเช้า") + ([ + '<s>', + '', + 'แมว', + 'ทํา', + '', + 'อะไร', + 'ตอน', + '', + 'ห้า', + '', + 'โมง', + '', + 'เช้า', + '</s>' + ], + [ + { + 'text': ['', 'ห้า'], + 'span': [7, 9], + 'entity_type': 'cardinal' + }, + { + 'text': ['', 'ห้า', '', 'โมง'], + 'span': [7, 11], + 'entity_type': 'time' + }, + { + 'text': ['', 'โมง'], + 'span': [9, 11], + 'entity_type': 'unit' + } + ]) + """ + return self.engine.tag(text)
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tag/pos_tag.html b/5.1/_modules/pythainlp/tag/pos_tag.html new file mode 100644 index 0000000..ae57939 --- /dev/null +++ b/5.1/_modules/pythainlp/tag/pos_tag.html @@ -0,0 +1,404 @@ + + + + + + + + pythainlp.tag.pos_tag — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tag.pos_tag

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from typing import List, Tuple
+
+
+
+[docs] +def pos_tag( + words: List[str], engine: str = "perceptron", corpus: str = "orchid" +) -> List[Tuple[str, str]]: + """ + Marks words with part-of-speech (POS) tags, such as 'NOUN' and 'VERB'. + + :param list words: a list of tokenized words + :param str engine: + * *perceptron* - perceptron tagger (default) + * *unigram* - unigram tagger + * *wangchanberta* - wangchanberta model. + * *tltk* - TLTK: Thai Language Toolkit (support TNC corpora only.\ + If you choose other corpora, they will be converted to TNC corpora.) + :param str corpus: the corpus that is used to create the language model for tagger + * *orchid* - `ORCHID \ + <https://www.academia.edu/9127599/Thai_Treebank>`_ corpus, \ + text from Thai academic articles (default) + * *orchid_ud* - ORCHID text, with tags mapped to Universal POS tags + * *blackboard* - `blackboard treebank <https://bitbucket.org/kaamanita/blackboard-treebank/src/master/>`_ + * *blackboard_ud* - blackboard text, with tags mapped to Universal POS tag \ + from `Universal Dependencies <https://universaldependencies.org/>` + * *pud* - `Parallel Universal Dependencies (PUD)\ + <https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \ + treebanks, natively use Universal POS tags + * *tdtb* - `Thai Discourse Treebank \ + <https://github.com/nlp-chula/thai-discourse-treebank/tree/main>`_ \ + , natively use Universal POS tags + * *tnc* - Thai National Corpus (support tltk engine only) + * *tdtb* - `Thai Discourse Treebank <https://github.com/nlp-chula/thai-discourse-treebank>`_ + * *tud* - `Thai Universal Dependency Treebank (TUD)\ + <https://github.com/nlp-chula/TUD>`_ \ + :return: a list of tuples (word, POS tag) + :rtype: list[tuple[str, str]] + + :Example: + + Tag words with corpus `orchid` (default):: + + from pythainlp.tag import pos_tag + + words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ', \\ + 'นายก', 'เชอร์ชิล'] + pos_tag(words) + # output: + # [('ฉัน', 'PPRS'), ('มี', 'VSTA'), ('ชีวิต', 'NCMN'), ('รอด', 'NCMN'), + # ('ใน', 'RPRE'), ('อาคาร', 'NCMN'), ('หลบภัย', 'NCMN'), + # ('ของ', 'RPRE'), ('นายก', 'NCMN'), ('เชอร์ชิล', 'NCMN')] + + Tag words with corpus `orchid_ud`:: + + from pythainlp.tag import pos_tag + + words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ', \\ + 'นายก', 'เชอร์ชิล'] + pos_tag(words, corpus='orchid_ud') + # output: + # [('ฉัน', 'PROPN'), ('มี', 'VERB'), ('ชีวิต', 'NOUN'), + # ('รอด', 'NOUN'), ('ใน', 'ADP'), ('อาคาร', 'NOUN'), + # ('หลบภัย', 'NOUN'), ('ของ', 'ADP'), ('นายก', 'NOUN'), + # ('เชอร์ชิล', 'NOUN')] + + Tag words with corpus `pud`:: + + from pythainlp.tag import pos_tag + + words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ', \\ + 'นายก', 'เชอร์ชิล'] + pos_tag(words, corpus='pud') + # [('ฉัน', 'PRON'), ('มี', 'VERB'), ('ชีวิต', 'NOUN'), ('รอด', 'VERB'), + # ('ใน', 'ADP'), ('อาคาร', 'NOUN'), ('หลบภัย', 'NOUN'), + # ('ของ', 'ADP'), ('นายก', 'NOUN'), ('เชอร์ชิล', 'PROPN')] + + Tag words with different engines including *perceptron* and *unigram*:: + + from pythainlp.tag import pos_tag + + words = ['เก้าอี้','มี','จำนวน','ขา', ' ', '=', '3'] + + pos_tag(words, engine='perceptron', corpus='orchid') + # output: + # [('เก้าอี้', 'NCMN'), ('มี', 'VSTA'), ('จำนวน', 'NCMN'), + # ('ขา', 'NCMN'), (' ', 'PUNC'), + # ('=', 'PUNC'), ('3', 'NCNM')] + + pos_tag(words, engine='unigram', corpus='pud') + # output: + # [('เก้าอี้', None), ('มี', 'VERB'), ('จำนวน', 'NOUN'), ('ขา', None), + # ('<space>', None), ('<equal>', None), ('3', 'NUM')] + """ + if not words: + return [] + + _support_corpus = [ + 
"blackboard", + "blackboard_ud", + "orchid", + "orchid_ud", + "pud", + "tdtb", + "tud", + ] + + if engine == "perceptron" and corpus in _support_corpus: + from pythainlp.tag.perceptron import tag as tag_ + elif engine == "tltk": + from pythainlp.tag.tltk import pos_tag as tag_ + + corpus = "tnc" + elif engine == "unigram" and corpus in _support_corpus: # default + from pythainlp.tag.unigram import tag as tag_ + else: + raise ValueError( + "pos_tag not support {0} engine or {1} corpus.".format( + engine, corpus + ) + ) + + word_tags = tag_(words, corpus=corpus) + + return word_tags
+ + + +
+[docs]
+def pos_tag_sents(
+    sentences: List[List[str]],
+    engine: str = "perceptron",
+    corpus: str = "orchid",
+) -> List[List[Tuple[str, str]]]:
+    """
+    Marks sentences with part-of-speech (POS) tags.
+
+    :param list sentences: a list of lists of tokenized words
+    :param str engine:
+        * *perceptron* - perceptron tagger (default)
+        * *unigram* - unigram tagger
+        * *tltk* - TLTK: Thai Language Toolkit (supports the TNC corpus only.\
+            If you choose another corpus, it will be converted to the TNC corpus.)
+    :param str corpus: the corpus used to create the language model for the tagger
+        * *orchid* - `ORCHID \
+            <https://www.academia.edu/9127599/Thai_Treebank>`_ corpus, \
+            text from Thai academic articles (default)
+        * *orchid_ud* - ORCHID text, with tags mapped to Universal POS tags
+        * *blackboard* - `blackboard treebank <https://bitbucket.org/kaamanita/blackboard-treebank/src/master/>`_
+        * *blackboard_ud* - blackboard text, with tags mapped to Universal POS tags \
+            from `Universal Dependencies <https://universaldependencies.org/>`_
+        * *pud* - `Parallel Universal Dependencies (PUD)\
+            <https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
+            treebanks, natively use Universal POS tags
+        * *tnc* - Thai National Corpus (supports the tltk engine only)
+    :return: a list of lists of tuples (word, POS tag)
+    :rtype: list[list[tuple[str, str]]]
+
+    :Example:
+
+    Label POS for two sentences::
+
+        from pythainlp.tag import pos_tag_sents
+
+        sentences = [['เก้าอี้','มี','3','ขา'], \\
+            ['นก', 'บิน', 'กลับ', 'รัง']]
+        pos_tag_sents(sentences, corpus='pud')
+        # output:
+        # [[('เก้าอี้', 'PROPN'), ('มี', 'VERB'), ('3', 'NUM'),
+        # ('ขา', 'NOUN')], [('นก', 'NOUN'), ('บิน', 'VERB'),
+        # ('กลับ', 'VERB'), ('รัง', 'NOUN')]]
+    """
+    if not sentences:
+        return []
+
+    return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
+ + + +def pos_tag_transformers( + sentence: str, + engine: str = "bert", + corpus: str = "blackboard", +) -> List[List[Tuple[str, str]]]: + """ + Marks sentences with part-of-speech (POS) tags. + + :param str sentence: a list of lists of tokenized words + :param str engine: + * *bert* - BERT: Bidirectional Encoder Representations from Transformers (default) + * *wangchanberta* - fine-tuned version of airesearch/wangchanberta-base-att-spm-uncased on pud corpus (support PUD cotpus only) + * *phayathaibert* - fine-tuned version of clicknext/phayathaibert \ + on blackboard corpus (support blackboard cotpus only) + * *mdeberta* - mDeBERTa: Multilingual Decoding-enhanced BERT with disentangled attention (support PUD corpus only) + :param str corpus: the corpus that is used to create the language model for tagger + * *blackboard* - `blackboard treebank (support bert engine only) <https://bitbucket.org/kaamanita/blackboard-treebank/src/master/>`_ + * *pud* - `Parallel Universal Dependencies (PUD)\ + <https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \ + treebanks, natively use Universal POS tags (support wangchanberta and mdeberta engine) + :return: a list of lists of tuples (word, POS tag) + :rtype: list[list[tuple[str, str]]] + + :Example: + + Labels POS for given sentence:: + + from pythainlp.tag import pos_tag_transformers + + sentences = "แมวทำอะไรตอนห้าโมงเช้า" + pos_tag_transformers(sentences, engine="bert", corpus='blackboard') + # output: + # [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]] + """ + + try: + from transformers import ( + AutoModelForTokenClassification, + AutoTokenizer, + TokenClassificationPipeline, + ) + except ImportError: + raise ImportError( + "Not found transformers! Please install transformers by pip install transformers" + ) + + if not sentence: + return [] + + _blackboard_support_engine = { + "bert": "lunarlist/pos_thai", + "phayathai": "lunarlist/pos_thai_phayathai", + } + + _pud_support_engine = { + "wangchanberta": "Pavarissy/wangchanberta-ud-thai-pud-upos", + "mdeberta": "Pavarissy/mdeberta-v3-ud-thai-pud-upos", + } + + if corpus == "blackboard" and engine in _blackboard_support_engine.keys(): + base_model = _blackboard_support_engine.get(engine) + model = AutoModelForTokenClassification.from_pretrained(base_model) + tokenizer = AutoTokenizer.from_pretrained(base_model) + elif corpus == "pud" and engine in _pud_support_engine.keys(): + base_model = _pud_support_engine.get(engine) + model = AutoModelForTokenClassification.from_pretrained(base_model) + tokenizer = AutoTokenizer.from_pretrained(base_model) + else: + raise ValueError( + "pos_tag_transformers not support {0} engine or {1} corpus.".format( + engine, corpus + ) + ) + + pipeline = TokenClassificationPipeline(model=model, + tokenizer=tokenizer, + aggregation_strategy="simple", + ) + + outputs = pipeline(sentence) + word_tags = [[(tag["word"], tag["entity_group"]) for tag in outputs]] + return word_tags +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tag/thainer.html b/5.1/_modules/pythainlp/tag/thainer.html new file mode 100644 index 0000000..7fb84b6 --- /dev/null +++ b/5.1/_modules/pythainlp/tag/thainer.html @@ -0,0 +1,354 @@ + + + + + + + + pythainlp.tag.thainer — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tag.thainer

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Named-entity recognizer
+"""
+
+__all__ = ["ThaiNameTagger"]
+
+from typing import Dict, List, Tuple, Union
+
+from pythainlp.corpus import get_corpus_path, thai_stopwords
+from pythainlp.tag import pos_tag
+from pythainlp.tokenize import word_tokenize
+from pythainlp.util import isthai
+
+_TOKENIZER_ENGINE = "mm"
+
+
+def _is_stopword(word: str) -> bool:  # check whether the word is a stopword
+    return word in thai_stopwords()
+
+
+def _doc2features(doc, i) -> Dict:
+    word = doc[i][0]
+    postag = doc[i][1]
+
+    # Features from current word
+    features = {
+        "word.word": word,
+        "word.stopword": _is_stopword(word),
+        "word.isthai": isthai(word),
+        "word.isspace": word.isspace(),
+        "postag": postag,
+        "word.isdigit": word.isdigit(),
+    }
+    if word.isdigit() and len(word) == 5:
+        features["word.islen5"] = True
+
+    # Features from previous word
+    if i > 0:
+        prevword = doc[i - 1][0]
+        prevpostag = doc[i - 1][1]
+        prev_features = {
+            "word.prevword": prevword,
+            "word.previsspace": prevword.isspace(),
+            "word.previsthai": isthai(prevword),
+            "word.prevstopword": _is_stopword(prevword),
+            "word.prevpostag": prevpostag,
+            "word.prevwordisdigit": prevword.isdigit(),
+        }
+        features.update(prev_features)
+    else:
+        features["BOS"] = True  # Special "Beginning of Sequence" tag
+
+    # Features from next word
+    if i < len(doc) - 1:
+        nextword = doc[i + 1][0]
+        nextpostag = doc[i + 1][1]
+        next_features = {
+            "word.nextword": nextword,
+            "word.nextisspace": nextword.isspace(),
+            "word.nextpostag": nextpostag,
+            "word.nextisthai": isthai(nextword),
+            "word.nextstopword": _is_stopword(nextword),
+            "word.nextwordisdigit": nextword.isdigit(),
+        }
+        features.update(next_features)
+    else:
+        features["EOS"] = True  # Special "End of Sequence" tag
+
+    return features
+
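To see what the CRF model is actually fed, this (module-private) feature extractor can be run on a tiny (word, POS) sequence. A sketch, assuming it runs in this module's context or after ``from pythainlp.tag.thainer import _doc2features``; the POS tags below are made up for illustration::

    doc = [("วันที่", "NOUN"), ("15", "NUM"), ("ก.ย.", "NOUN")]  # hypothetical POS tags

    features = [_doc2features(doc, i) for i in range(len(doc))]
    print(features[0]["word.word"])      # 'วันที่'
    print(features[0]["BOS"])            # True: the first token has no previous word
    print(features[1]["word.prevword"])  # 'วันที่'
    print(features[2]["EOS"])            # True: the last token has no next word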
+
+
+[docs]
+class ThaiNameTagger:
+    """
+    Thai named-entity recognizer or Thai NER.
+    This class supports Thai NER 1.4 and 1.5 only.
+
+    :param str version: Thai NER version.
+                        It supports Thai NER 1.4 & 1.5.
+                        The default value is `1.4`.
+
+    :Example:
+    ::
+
+        from pythainlp.tag.thainer import ThaiNameTagger
+
+        thainer14 = ThaiNameTagger(version="1.4")
+        thainer14.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
+    """
+
+[docs]
+    def __init__(self, version: str = "1.4") -> None:
+        """
+        Thai named-entity recognizer.
+
+        :param str version: Thai NER version.
+                            It supports Thai NER 1.4 & 1.5.
+                            The default value is `1.4`.
+        """
+        from pycrfsuite import Tagger as CRFTagger
+
+        self.crf = CRFTagger()
+
+        if version == "1.4":
+            self.crf.open(get_corpus_path("thainer-1.4", version="1.4"))
+            self.pos_tag_name = "orchid_ud"
+        elif version == "1.5":
+            self.crf.open(get_corpus_path("thainer", version="1.5"))
+            self.pos_tag_name = "blackboard"
+ + +
+[docs] + def get_ner( + self, text: str, pos: bool = True, tag: bool = False + ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]: + """ + This function tags named-entities in text in IOB format. + + :param str text: text in Thai to be tagged + :param bool pos: To include POS tags in the results (`True`) or + exclude (`False`). The default value is `True` + :param bool tag: output HTML-like tags. + :return: a list of tuples associated with tokenized words, NER tags, + POS tags (if the parameter `pos` is specified as `True`), + and output HTML-like tags (if the parameter `tag` is + specified as `True`). + Otherwise, return a list of tuples associated with tokenized + words and NER tags + :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str + + :Note: + * For the POS tags to be included in the results, this function + uses :func:`pythainlp.tag.pos_tag` with engine `perceptron` + and corpus `orchid_ud`. + + :Example: + + >>> from pythainlp.tag.thainer import ThaiNameTagger + >>> + >>> ner = ThaiNameTagger() + >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.") + [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), + ('15', 'NUM', 'B-DATE'), (' ', 'PUNCT', 'I-DATE'), + ('ก.ย.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'), + ('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'), + ('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'), + ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), + ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'), + ('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'), + ('น.', 'NOUN', 'I-TIME')] + >>> + >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", + pos=False) + [('วันที่', 'O'), (' ', 'O'), + ('15', 'B-DATE'), (' ', 'I-DATE'), + ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), + ('61', 'I-DATE'), (' ', 'O'), + ('ทดสอบ', 'O'), ('ระบบ', 'O'), + ('เวลา', 'O'), (' ', 'O'), + ('14', 'B-TIME'), (':', 'I-TIME'), + ('49', 'I-TIME'), (' ', 'I-TIME'), + ('น.', 'I-TIME')] + >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", + tag=True) + 'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME> + 14:49 น.</TIME>' + """ + tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE) + pos_tags = pos_tag( + tokens, engine="perceptron", corpus=self.pos_tag_name + ) + x_test = ThaiNameTagger.__extract_features(pos_tags) + y = self.crf.tag(x_test) + + sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)] + + if tag: + temp = "" + sent = "" + for idx, (word, ner) in enumerate(sent_ner): + if ner.startswith("B-") and temp != "": + sent += "</" + temp + ">" + temp = ner[2:] + sent += "<" + temp + ">" + elif ner.startswith("B-"): + temp = ner[2:] + sent += "<" + temp + ">" + elif ner == "O" and temp != "": + sent += "</" + temp + ">" + temp = "" + sent += word + + if idx == len(sent_ner) - 1 and temp != "": + sent += "</" + temp + ">" + + return sent + + if pos: + return [ + (pos_tags[i][0], pos_tags[i][1], data) + for i, data in enumerate(y) + ] + + return sent_ner
+ + + @staticmethod + def __extract_features(doc): + return [_doc2features(doc, i) for i in range(len(doc))]
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/attacut.html b/5.1/_modules/pythainlp/tokenize/attacut.html new file mode 100644 index 0000000..0d777e5 --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/attacut.html @@ -0,0 +1,200 @@ + + + + + + + + pythainlp.tokenize.attacut — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.attacut

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai
+
+:See Also:
+    * `GitHub repository <https://github.com/PyThaiNLP/attacut>`_
+"""
+from typing import Dict, List
+
+from attacut import Tokenizer
+
+
+
+[docs] +class AttacutTokenizer: +
+[docs] + def __init__(self, model="attacut-sc"): + self._MODEL_NAME = "attacut-sc" + + if model == "attacut-c": + self._MODEL_NAME = "attacut-c" + + self._tokenizer = Tokenizer(model=self._MODEL_NAME)
+ + +
+[docs] + def tokenize(self, text: str) -> List[str]: + return self._tokenizer.tokenize(text)
+
+ + + +_tokenizers: Dict[str, AttacutTokenizer] = {} + + +
+[docs]
+def segment(text: str, model: str = "attacut-sc") -> List[str]:
+    """
+    Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai
+
+    :param str text: text to be tokenized into words
+    :param str model: name of the word tokenizer model
+    :return: list of words, tokenized from the text
+    :rtype: list[str]
+
+    **Options for model**
+        * *attacut-sc* (default) using both syllable and character features
+        * *attacut-c* using only character features
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    global _tokenizers
+    if model not in _tokenizers:
+        _tokenizers[model] = AttacutTokenizer(model)
+
+    return _tokenizers[model].tokenize(text)
+ +
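A short usage sketch for this wrapper; it requires the optional ``attacut`` package, and no output is asserted because the exact segmentation depends on the installed model version::

    from pythainlp.tokenize.attacut import segment

    words_sc = segment("ทดสอบการตัดคำภาษาไทย")                    # default "attacut-sc" model
    words_c = segment("ทดสอบการตัดคำภาษาไทย", model="attacut-c")  # character features only
    print(words_sc, words_c)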
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/core.html b/5.1/_modules/pythainlp/tokenize/core.html new file mode 100644 index 0000000..6498f2c --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/core.html @@ -0,0 +1,1083 @@ + + + + + + + + pythainlp.tokenize.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Generic functions of tokenizers
+"""
+
+import copy
+import re
+from typing import Iterable, List, Union
+
+from pythainlp.tokenize import (
+    DEFAULT_SENT_TOKENIZE_ENGINE,
+    DEFAULT_SUBWORD_TOKENIZE_ENGINE,
+    DEFAULT_SYLLABLE_DICT_TRIE,
+    DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
+    DEFAULT_WORD_DICT_TRIE,
+    DEFAULT_WORD_TOKENIZE_ENGINE,
+)
+from pythainlp.tokenize._utils import (
+    apply_postprocessors,
+    rejoin_formatted_num,
+    strip_whitespace,
+)
+from pythainlp.util.trie import Trie, dict_trie
+
+
+
+[docs] +def word_detokenize( + segments: Union[List[List[str]], List[str]], output: str = "str" +) -> Union[List[str], str]: + """ + Word detokenizer. + + This function will detokenize the list of words in each sentence into text. + + :param str segments: List of sentences, each with a list of words. + :param str output: the output type (str or list) + :return: the Thai text + :rtype: Union[str,List[str]] + :Example: + :: + + from pythainlp.tokenize import word_detokenize + + print(word_detokenize(["เรา", "เล่น"])) + # output: เราเล่น + """ + list_all = [] + + if isinstance(segments[0], str): + segments = [segments] + + from pythainlp import thai_characters + + for i, s in enumerate(segments): + list_sents = [] + add_index = [] + space_index = [] + mark_index = [] + for j, w in enumerate(s): + if j > 0: + # previous word + p_w = s[j - 1] + # if w is number or other language and is not space + if ( + w[0] not in thai_characters + and not w.isspace() + and not p_w.isspace() + ): + list_sents.append(" ") + add_index.append(j) + # if previous word is number or other language and is not space + elif p_w[0] not in thai_characters and not p_w.isspace(): + list_sents.append(" ") + add_index.append(j) + # if word is Thai iteration mark + elif w == "ๆ": + if not p_w.isspace(): + list_sents.append(" ") + mark_index.append(j) + elif w.isspace() and j - 1 not in space_index: + space_index.append(j) + elif j - 1 in mark_index: + list_sents.append(" ") + list_sents.append(w) + list_all.append(list_sents) + + if output == "list": + return list_all + + text = [] + for i in list_all: + text.append("".join(i)) + return " ".join(text)
+ + + +
+[docs] +def word_tokenize( + text: str, + custom_dict: Trie = Trie([]), + engine: str = DEFAULT_WORD_TOKENIZE_ENGINE, + keep_whitespace: bool = True, + join_broken_num: bool = True, +) -> List[str]: + """ + Word tokenizer. + + Tokenizes running text into words (list of strings). + + :param str text: text to be tokenized + :param str engine: name of the tokenizer to be used + :param pythainlp.util.Trie custom_dict: dictionary trie (some engine may not support) + :param bool keep_whitespace: True to keep whitespace, a common mark + for end of phrase in Thai. + Otherwise, whitespace is omitted. + :param bool join_broken_num: True to rejoin formatted numeric that could be wrongly separated. + Otherwise, formatted numeric could be wrongly separated. + + :return: list of words + :rtype: List[str] + **Options for engine** + * *attacut* - wrapper for + `AttaCut <https://github.com/PyThaiNLP/attacut>`_., + learning-based approach + * *deepcut* - wrapper for + `DeepCut <https://github.com/rkcosmos/deepcut>`_, + learning-based approach + * *icu* - wrapper for a word tokenizer in + `PyICU <https://gitlab.pyicu.org/main/pyicu>`_., + from ICU (International Components for Unicode), + dictionary-based + * *longest* - dictionary-based, longest matching + * *mm* - "multi-cut", dictionary-based, maximum matching + * *nercut* - dictionary-based, maximal matching, + constrained by Thai Character Cluster (TCC) boundaries, + combining tokens that are parts of the same named-entity + * *newmm* (default) - "new multi-cut", + dictionary-based, maximum matching, + constrained by Thai Character Cluster (TCC) boundaries + with improved TCC rules that are used in newmm. + * *newmm-safe* - newmm, with a mechanism to avoid long + processing time for text with continuously ambiguous breaking points + * *nlpo3* - wrapper for a word tokenizer in + `nlpO3 <https://github.com/PyThaiNLP/nlpo3>`_., + adaptation of newmm in Rust (2.5x faster) + * *oskut* - wrapper for + `OSKut <https://github.com/mrpeerat/OSKut>`_., + Out-of-domain StacKed cut for Word Segmentation + * *sefr_cut* - wrapper for + `SEFR CUT <https://github.com/mrpeerat/SEFR_CUT>`_., + Stacked Ensemble Filter and Refine for Word Segmentation + * *tltk* - wrapper for + `TLTK <https://pypi.org/project/tltk/>`_., + maximum collocation approach + :Note: + - The **custom_dict** parameter only works for \ + *deepcut*, *longest*, *newmm*, and *newmm-safe* engines. + :Example: + + Tokenize text with different tokenizers:: + + from pythainlp.tokenize import word_tokenize + + text = "โอเคบ่พวกเรารักภาษาบ้านเกิด" + + word_tokenize(text, engine="newmm") + # output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด'] + + word_tokenize(text, engine='attacut') + # output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด'] + + Tokenize text with whitespace omitted:: + + text = "วรรณกรรม ภาพวาด และการแสดงงิ้ว " + + word_tokenize(text, engine="newmm") + # output: + # ['วรรณกรรม', ' ', 'ภาพวาด', ' ', 'และ', 'การแสดง', 'งิ้ว', ' '] + + word_tokenize(text, engine="newmm", keep_whitespace=False) + # output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว'] + + Join broken formatted numeric (e.g. 
time, decimals, IP addresses):: + + text = "เงิน1,234บาท19:32น 127.0.0.1" + + word_tokenize(text, engine="attacut", join_broken_num=False) + # output: + # ['เงิน', '1', ',', '234', 'บาท', '19', ':', '32น', ' ', + # '127', '.', '0', '.', '0', '.', '1'] + + word_tokenize(text, engine="attacut", join_broken_num=True) + # output: + # ['เงิน', '1,234', 'บาท', '19:32น', ' ', '127.0.0.1'] + + Tokenize with default and custom dictionaries:: + + from pythainlp.corpus.common import thai_words + from pythainlp.tokenize import dict_trie + + text = 'ชินโซ อาเบะ เกิด 21 กันยายน' + + word_tokenize(text, engine="newmm") + # output: + # ['ชิน', 'โซ', ' ', 'อา', 'เบะ', ' ', + # 'เกิด', ' ', '21', ' ', 'กันยายน'] + + custom_dict_japanese_name = set(thai_words() + custom_dict_japanese_name.add('ชินโซ') + custom_dict_japanese_name.add('อาเบะ') + + trie = dict_trie(dict_source=custom_dict_japanese_name) + + word_tokenize(text, engine="newmm", custom_dict=trie)) + # output: + # ['ชินโซ', ' ', 'อาเบะ', ' ', + # 'เกิด', ' ', '21', ' ', 'กันยายน'] + """ + if not text or not isinstance(text, str): + return [] + + segments = [] + + if custom_dict and engine in ( + "attacut", + "icu", + "nercut", + "sefr_cut", + "tltk", + "oskut" + ): + raise NotImplementedError( + f"The {engine} engine does not support custom dictionaries." + ) + + if engine in ("newmm", "onecut"): + from pythainlp.tokenize.newmm import segment + + segments = segment(text, custom_dict) + elif engine == "newmm-safe": + from pythainlp.tokenize.newmm import segment + + segments = segment(text, custom_dict, safe_mode=True) + elif engine == "attacut": + from pythainlp.tokenize.attacut import segment + + segments = segment(text) + elif engine == "longest": + from pythainlp.tokenize.longest import segment + + segments = segment(text, custom_dict) + elif engine in ("mm", "multi_cut"): + from pythainlp.tokenize.multi_cut import segment + + segments = segment(text, custom_dict) + elif engine == "deepcut": # deepcut can optionally use dictionary + from pythainlp.tokenize.deepcut import segment + + if custom_dict: + custom_dict = list(custom_dict) + segments = segment(text, custom_dict) + else: + segments = segment(text) + elif engine == "icu": + from pythainlp.tokenize.pyicu import segment + + segments = segment(text) + elif engine == "nercut": + from pythainlp.tokenize.nercut import segment + + segments = segment(text) + elif engine == "sefr_cut": + from pythainlp.tokenize.sefr_cut import segment + + segments = segment(text) + elif engine == "tltk": + from pythainlp.tokenize.tltk import segment + + segments = segment(text) + elif engine == "oskut": + from pythainlp.tokenize.oskut import segment + + segments = segment(text) + elif engine == "nlpo3": + from pythainlp.tokenize.nlpo3 import segment + + # Currently cannot handle custom_dict from inside word_tokenize(), + # due to difference in type. + # if isinstance(custom_dict, str): + # segments = segment(text, custom_dict=custom_dict) + # elif not isinstance(custom_dict, str) and not custom_dict: + # raise ValueError( + # f"""Tokenizer \"{engine}\": + # custom_dict must be a str. + # It is a dictionary name as assigned with load_dict(). + # See pythainlp.tokenize.nlpo3.load_dict()""" + # ) + # else: + # segments = segment(text) + segments = segment(text) + else: + raise ValueError( + f"""Tokenizer \"{engine}\" not found. 
+ It might be a typo; if not, please consult our document.""" + ) + + postprocessors = [] + if join_broken_num: + postprocessors.append(rejoin_formatted_num) + + if not keep_whitespace: + postprocessors.append(strip_whitespace) + + segments = apply_postprocessors(segments, postprocessors) + + return segments
+ + + +def indices_words(words): + indices = [] + start_index = 0 + for word in words: + end_index = start_index + len(word) - 1 + indices.append((start_index, end_index)) + start_index += len(word) + + return indices + + +def map_indices_to_words(index_list, sentences): + result = [] + c = copy.copy(index_list) + n_sum = 0 + for sentence in sentences: + words = sentence + sentence_result = [] + n = 0 + for start, end in c: + if start > n_sum + len(words) - 1: + break + else: + word = sentence[start - n_sum : end + 1 - n_sum] + sentence_result.append(word) + n += 1 + + result.append(sentence_result) + n_sum += len(words) + for _ in range(n): + del c[0] + return result + + +
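These two private helpers are what let ``sent_tokenize`` accept a pre-tokenized word list: ``indices_words`` records the character span of every word, and ``map_indices_to_words`` re-slices the joined text back into those words for each detected sentence. A toy walk-through with placeholder strings (run in this module's context, since the helpers are not public API)::

    words = ["ab", "c", "de"]
    spans = indices_words(words)
    print(spans)  # [(0, 1), (2, 2), (3, 4)]: start/end character offsets per word

    # pretend the sentence splitter returned a single sentence covering all characters
    print(map_indices_to_words(spans, ["abcde"]))
    # [['ab', 'c', 'de']]: the original words regrouped under that sentence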
+[docs] +def sent_tokenize( + text: Union[str, List[str]], + engine: str = DEFAULT_SENT_TOKENIZE_ENGINE, + keep_whitespace: bool = True, +) -> List[str]: + """ + Sentence tokenizer. + + Tokenizes running text into "sentences". Supports both string and list of strings. + + :param text: the text (string) or list of words (list of strings) to be tokenized + :param str engine: choose among *'crfcut'*, *'whitespace'*, \ + *'whitespace+newline'* + :return: list of split sentences + :rtype: list[str] + **Options for engine** + * *crfcut* - (default) split by CRF trained on TED dataset + * *thaisum* - The implementation of sentence segmenter from \ + Nakhun Chumpolsathien, 2020 + * *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_., + * *wtp* - split by `wtpsplitaxe <https://github.com/bminixhofer/wtpsplit>`_., \ + It supports many sizes of models. You can use ``wtp`` to use mini model, \ + ``wtp-tiny`` to use ``wtp-bert-tiny`` model (default), \ + ``wtp-mini`` to use ``wtp-bert-mini`` model, \ + ``wtp-base`` to use ``wtp-canine-s-1l`` model, \ + and ``wtp-large`` to use ``wtp-canine-s-12l`` model. + * *whitespace+newline* - split by whitespace and newline. + * *whitespace* - split by whitespace, specifically with \ + :class:`regex` pattern ``r" +"`` + :Example: + + Split the text based on *whitespace*:: + + from pythainlp.tokenize import sent_tokenize + + sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม" + sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\ + และได้รับมอบหมายให้ประจำในระดับภูมิภาค" + + sent_tokenize(sentence_1, engine="whitespace") + # output: ['ฉันไปประชุมเมื่อวันที่', '11', 'มีนาคม'] + + sent_tokenize(sentence_2, engine="whitespace") + # output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ', + # '\\nและได้รับมอบหมายให้ประจำในระดับภูมิภาค'] + + Split the text based on *whitespace* and *newline*:: + + sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม" + sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\ + และได้รับมอบหมายให้ประจำในระดับภูมิภาค" + + sent_tokenize(sentence_1, engine="whitespace+newline") + # output: ['ฉันไปประชุมเมื่อวันที่', '11', 'มีนาคม'] + sent_tokenize(sentence_2, engine="whitespace+newline") + # output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ', + '\\nและได้รับมอบหมายให้ประจำในระดับภูมิภาค'] + + Split the text using CRF trained on TED dataset:: + + sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม" + sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\ + และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค" + + sent_tokenize(sentence_1, engine="crfcut") + # output: ['ฉันไปประชุมเมื่อวันที่ 11 มีนาคม'] + + sent_tokenize(sentence_2, engine="crfcut") + # output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ ', + 'และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค'] + """ + + if not text or not isinstance(text, (str, list)): + return [] + + is_list_input = isinstance(text, list) + + if is_list_input: + try: + original_text = "".join(text) + except ValueError: + return [] + + else: + original_text = text + + segments = [] + + if engine == "crfcut": + from pythainlp.tokenize.crfcut import segment + + segments = segment(original_text) + + if is_list_input: + word_indices = indices_words(text) + result = map_indices_to_words(word_indices, [original_text]) + return result + elif engine == "whitespace": + segments = re.split(r" +", original_text, flags=re.U) + if is_list_input: + result = [] + _temp: list[str] = [] + for i, w in enumerate(text): + if re.findall(r" ", w) != [] and re.findall(r"\w", w) == []: + if not _temp: + continue + result.append(_temp) + _temp = [] + else: + _temp.append(w) + if i 
+ 1 == len(text): + result.append(_temp) + return result + elif engine == "whitespace+newline": + segments = original_text.split() + if is_list_input: + result = [] + _temp = [] + for i, w in enumerate(text): + if ( + re.findall(r"\s", w) != [] or re.findall(r"\n", w) != [] + ) and re.findall(r"\w", w) == []: + if not _temp: + continue + result.append(_temp) + _temp = [] + else: + _temp.append(w) + if i + 1 == len(text): + result.append(_temp) + return result + elif engine == "tltk": + from pythainlp.tokenize.tltk import sent_tokenize as segment + + segments = segment(original_text) + elif engine == "thaisum": + from pythainlp.tokenize.thaisumcut import ( + ThaiSentenceSegmentor as segmentor, + ) + + segment = segmentor() + segments = segment.split_into_sentences(original_text) + elif engine.startswith("wtp"): + if "-" not in engine: + _size = "mini" + else: + _size = engine.split("-")[-1] + from pythainlp.tokenize.wtsplit import tokenize as segment + + segments = segment(original_text, size=_size, tokenize="sentence") + else: + raise ValueError( + f"""Tokenizer \"{engine}\" not found. + It might be a typo; if not, please consult our document.""" + ) + + if not keep_whitespace: + segments = strip_whitespace(segments) + + if is_list_input and engine not in ["crfcut"]: + word_indices = indices_words(text) + result = map_indices_to_words(word_indices, segments) + return result + else: + return segments
+ + + +
+[docs] +def paragraph_tokenize( + text: str, + engine: str = "wtp-mini", + paragraph_threshold: float = 0.5, + style: str = "newline", +) -> List[List[str]]: + """ + Paragraph tokenizer. + + Tokenizes text into paragraphs. + + :param str text: text to be tokenized + :param str engine: the name of paragraph tokenizer + :return: list of paragraphs + :rtype: List[List[str]] + **Options for engine** + * *wtp* - split by `wtpsplitaxe <https://github.com/bminixhofer/wtpsplit>`_., \ + It supports many sizes of models. You can use ``wtp`` to use mini model, \ + ``wtp-tiny`` to use ``wtp-bert-tiny`` model (default), \ + ``wtp-mini`` to use ``wtp-bert-mini`` model, \ + ``wtp-base`` to use ``wtp-canine-s-1l`` model, \ + and ``wtp-large`` to use ``wtp-canine-s-12l`` model. + + :Example: + + Split the text based on *wtp*:: + + from pythainlp.tokenize import paragraph_tokenize + + sent = ( + "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต" + +" มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด" + +" จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้" + ) + + paragraph_tokenize(sent) + # output: [ + # ['(1) '], + # [ + # 'บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต ', + # 'มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ', + # 'จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ', + # 'ณ ที่นี้' + # ]] + """ + if engine.startswith("wtp"): + if "-" not in engine: + size = "mini" + else: + size = engine.split("-")[-1] + + from pythainlp.tokenize.wtsplit import tokenize as segment + + segments = segment( + text, + size=size, + tokenize="paragraph", + paragraph_threshold=paragraph_threshold, + style=style, + ) + else: + raise ValueError( + f"""Tokenizer \"{engine}\" not found. + It might be a typo; if not, please consult our document.""" + ) + + return segments
+ + + +
+[docs] +def subword_tokenize( + text: str, + engine: str = DEFAULT_SUBWORD_TOKENIZE_ENGINE, + keep_whitespace: bool = True, +) -> List[str]: + """ + Subword tokenizer for tokenizing text into units smaller than syllables. + + Tokenizes text into inseparable units of + Thai contiguous characters, namely + `Thai Character Clusters (TCCs) \ + <https://www.researchgate.net/publication/2853284_Character_Cluster_Based_Thai_Information_Retrieval>`_ + TCCs are units based on Thai spelling features that could not be + separated any character further such as 'ก็', 'จะ', 'ไม่', and 'ฝา'. + If the following units are separated, they could not be spelled out. + This function applies TCC rules to tokenize the text into + the smallest units. + + For example, the word 'ขนมชั้น' would be tokenized + into 'ข', 'น', 'ม', and 'ชั้น'. + + :param str text: text to be tokenized + :param str engine: the name of subword tokenizer + :param bool keep_whitespace: keep whitespace + :return: list of subwords + :rtype: List[str] + **Options for engine** + * *dict* - newmm word tokenizer with a syllable dictionary + * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001) + * *han_solo* - CRF syllable segmenter for Thai that can work in the \ + Thai social media domain. See `PyThaiNLP/Han-solo \ + <https://github.com/PyThaiNLP/Han-solo>`_. + * *ssg* - CRF syllable segmenter for Thai. See `ponrawee/ssg \ + <https://github.com/ponrawee/ssg>`_. + * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000) + * *tcc_p* - Thai Character Cluster + improved rules that are used in newmm + * *tltk* - syllable tokenizer from tltk. See `tltk \ + <https://pypi.org/project/tltk/>`_. + * *wangchanberta* - SentencePiece from wangchanberta model + :Example: + + Tokenize text into subwords based on *tcc*:: + + from pythainlp.tokenize import subword_tokenize + + text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง" + text_2 = "ความแปลกแยกและพัฒนาการ" + + subword_tokenize(text_1, engine='tcc') + # output: ['ยุ', 'ค', 'เริ่ม', 'แร', 'ก', + # 'ข', 'อ', 'ง', ' ', 'รา', 'ช', 'ว', 'ง', + # 'ศ', '์', 'ห', 'มิ', 'ง'] + + subword_tokenize(text_2, engine='tcc') + # output: ['ค', 'วา', 'ม', 'แป', 'ล', 'ก', 'แย', 'ก', + 'และ', 'พัฒ','นา', 'กา', 'ร'] + + Tokenize text into subwords based on *etcc*:: + + text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง" + text_2 = "ความแปลกแยกและพัฒนาการ" + + subword_tokenize(text_1, engine='etcc') + # output: ['ยุคเริ่มแรกของ ราชวงศ์หมิง'] + + subword_tokenize(text_2, engine='etcc') + # output: ['ความแปลกแยกและ', 'พัฒ', 'นาการ'] + + Tokenize text into subwords based on *wangchanberta*:: + + text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง" + text_2 = "ความแปลกแยกและพัฒนาการ" + + subword_tokenize(text_1, engine='wangchanberta') + # output: ['▁', 'ยุค', 'เริ่มแรก', 'ของ', '▁', 'ราชวงศ์', 'หมิง'] + + subword_tokenize(text_2, engine='wangchanberta') + # output: ['▁ความ', 'แปลก', 'แยก', 'และ', 'พัฒนาการ'] + """ + if not text or not isinstance(text, str): + return [] + + segments = [] + + if engine == "tcc": + from pythainlp.tokenize.tcc import segment + elif engine == "tcc_p": + from pythainlp.tokenize.tcc_p import segment + elif engine == "etcc": + from pythainlp.tokenize.etcc import segment + elif engine == "wangchanberta": + from pythainlp.wangchanberta import segment + elif engine == "dict": # use syllable dictionary + words = word_tokenize(text) + for word in words: + segments.extend( + word_tokenize( + text=word, custom_dict=DEFAULT_SYLLABLE_DICT_TRIE + ) + ) + elif engine == "ssg": + from pythainlp.tokenize.ssg import segment + elif 
engine == "tltk": + from pythainlp.tokenize.tltk import syllable_tokenize as segment + elif engine == "han_solo": + from pythainlp.tokenize.han_solo import segment + elif engine == "phayathai": + from pythainlp.phayathaibert import segment + else: + raise ValueError( + f"""Tokenizer \"{engine}\" not found. + It might be a typo; if not, please consult our document.""" + ) + + if not segments: + segments = segment(text) + + if not keep_whitespace: + segments = strip_whitespace(segments) + + return segments
+ + + +
+[docs]
+def syllable_tokenize(
+    text: str,
+    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
+    keep_whitespace: bool = True,
+) -> List[str]:
+    """
+    Syllable tokenizer.
+
+    Tokenizes text into inseparable units of Thai syllables.
+
+    :param str text: text to be tokenized
+    :param str engine: the name of the syllable tokenizer
+    :param bool keep_whitespace: keep whitespace
+    :return: list of syllables
+    :rtype: List[str]
+    **Options for engine**
+        * *dict* - newmm word tokenizer with a syllable dictionary
+        * *han_solo* - CRF syllable segmenter for Thai that can work in the \
+            Thai social media domain. See `PyThaiNLP/Han-solo \
+            <https://github.com/PyThaiNLP/Han-solo>`_.
+        * *ssg* - CRF syllable segmenter for Thai. See `ponrawee/ssg \
+            <https://github.com/ponrawee/ssg>`_.
+        * *tltk* - syllable tokenizer from tltk. See `tltk \
+            <https://pypi.org/project/tltk/>`_.
+    """
+    if engine not in ["dict", "han_solo", "ssg", "tltk"]:
+        raise ValueError(
+            f"""Tokenizer \"{engine}\" not found.
+            It might be a typo; if not, please consult our document."""
+        )
+    return subword_tokenize(
+        text=text, engine=engine, keep_whitespace=keep_whitespace
+    )
+ + + +
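A brief usage sketch; the exact syllable boundaries depend on the engine chosen (and some engines need optional extra packages), so no output is asserted here::

    from pythainlp.tokenize import syllable_tokenize

    print(syllable_tokenize("อิสรภาพ", engine="dict"))
    print(syllable_tokenize("อิสรภาพ", engine="ssg"))  # requires the optional ssg package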
+[docs] +def display_cell_tokenize(text: str) -> List[str]: + """ + Display cell tokenizer. + + Tokenizes Thai text into display cells without splitting tone marks. + + :param str text: text to be tokenized + :return: list of display cells + :rtype: List[str] + :Example: + + Tokenize Thai text into display cells:: + + from pythainlp.tokenize import display_cell_tokenize + + text = "แม่น้ำอยู่ที่ไหน" + display_cell_tokenize(text) + # output: ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น'] + """ + if not text or not isinstance(text, str): + return [] + + display_cells = [] + current_cell = "" + text = text.replace("ำ", "ํา") + + for char in text: + if re.match(r"[\u0E31\u0E34-\u0E3A\u0E47-\u0E4E]", char): + current_cell += char + else: + if current_cell: + display_cells.append(current_cell) + current_cell = char + + if current_cell: + display_cells.append(current_cell) + + return display_cells
+ + + +
+[docs] +class Tokenizer: + """ + Tokenizer class for a custom tokenizer. + + This class allows users to pre-define custom dictionary along with + tokenizer and encapsulate them into one single object. + It is an wrapper for both functions, that are + :func:`pythainlp.tokenize.word_tokenize`, + and :func:`pythainlp.util.dict_trie` + + :Example: + + Tokenizer object instantiated with :class:`pythainlp.util.Trie`:: + + from pythainlp.tokenize import Tokenizer + from pythainlp.corpus.common import thai_words + from pythainlp.util import dict_trie + + custom_words_list = set(thai_words()) + custom_words_list.add('อะเฟเซีย') + custom_words_list.add('Aphasia') + trie = dict_trie(dict_source=custom_words_list) + + text = "อะเฟเซีย (Aphasia*) เป็นอาการผิดปกติของการพูด" + _tokenizer = Tokenizer(custom_dict=trie, engine='newmm') + _tokenizer.word_tokenize(text) + # output: ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ', + 'ผิดปกติ', 'ของ', 'การ', 'พูด'] + + Tokenizer object instantiated with a list of words:: + + text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด" + _tokenizer = Tokenizer(custom_dict=list(thai_words()), engine='newmm') + _tokenizer.word_tokenize(text) + # output: + # ['อะ', 'เฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ', + # 'ผิดปกติ', 'ของ', 'การ', 'พูด'] + + Tokenizer object instantiated with a file path containing a list of + words separated with *newline* and explicitly setting a new tokenizer + after initiation:: + + PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txtt' + + # write a file + with open(PATH_TO_CUSTOM_DICTIONARY, 'w', encoding='utf-8') as f: + f.write('อะเฟเซีย\\nAphasia\\nผิด\\nปกติ') + + text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด" + + # initiate an object from file with `attacut` as tokenizer + _tokenizer = Tokenizer(custom_dict=PATH_TO_CUSTOM_DICTIONARY, \\ + engine='attacut') + + _tokenizer.word_tokenize(text) + # output: + # ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ', 'ผิด', + # 'ปกติ', 'ของ', 'การ', 'พูด'] + + # change tokenizer to `newmm` + _tokenizer.set_tokenizer_engine(engine='newmm') + _tokenizer.word_tokenize(text) + # output: + # ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็นอาการ', 'ผิด', + # 'ปกติ', 'ของการพูด'] + """ + +
+[docs]
+    def __init__(
+        self,
+        custom_dict: Union[Trie, Iterable[str], str] = [],
+        engine: str = "newmm",
+        keep_whitespace: bool = True,
+        join_broken_num: bool = True,
+    ):
+        """
+        Initialize tokenizer object.
+
+        :param str custom_dict: a file path, a list of vocabularies to be
+                    used to create a trie, or an instantiated
+                    :class:`pythainlp.util.Trie` object.
+        :param str engine: choose among different options of tokenizer engines
+                    (i.e. *newmm*, *mm*, *longest*, *deepcut*)
+        :param bool keep_whitespace: True to keep whitespace, a common mark
+                    for end of phrase in Thai
+        """
+        self.__trie_dict = Trie([])
+        if custom_dict:
+            self.__trie_dict = dict_trie(custom_dict)
+        else:
+            self.__trie_dict = DEFAULT_WORD_DICT_TRIE
+        self.__engine = engine
+        if self.__engine not in ["newmm", "mm", "longest", "deepcut"]:
+            raise NotImplementedError(
+                """
+                The Tokenizer class does not support %s as a custom tokenizer engine
+                """
+                % self.__engine
+            )
+        self.__keep_whitespace = keep_whitespace
+        self.__join_broken_num = join_broken_num
+ + +
+[docs] + def word_tokenize(self, text: str) -> List[str]: + """ + Main tokenization function. + + :param str text: text to be tokenized + :return: list of words, tokenized from the text + :rtype: list[str] + """ + return word_tokenize( + text, + custom_dict=self.__trie_dict, + engine=self.__engine, + keep_whitespace=self.__keep_whitespace, + join_broken_num=self.__join_broken_num, + )
+ + +
+[docs] + def set_tokenize_engine(self, engine: str) -> None: + """ + Set the tokenizer's engine. + + :param str engine: choose between different options of tokenizer engines + (i.e. *newmm*, *mm*, *longest*, *deepcut*) + """ + self.__engine = engine
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/crfcut.html b/5.1/_modules/pythainlp/tokenize/crfcut.html new file mode 100644 index 0000000..f6f5403 --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/crfcut.html @@ -0,0 +1,365 @@ + + + + + + + + pythainlp.tokenize.crfcut — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.crfcut

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+CRFCut - Thai sentence segmenter.
+
+Thai sentence segmentation using conditional random field,
+with default model trained on TED dataset
+
+Performance:
+- ORCHID - space-correct accuracy 87% vs 95% state-of-the-art
+  (Zhou et al, 2016; https://www.aclweb.org/anthology/C16-1031.pdf)
+- TED dataset - space-correct accuracy 82%
+
+See development notebooks at https://github.com/vistec-AI/ted_crawler;
+POS features are not used because reliable POS tagging is not available
+"""
+
+import os
+from typing import List
+
+import pycrfsuite
+
+from pythainlp.corpus import corpus_path
+from pythainlp.tokenize import word_tokenize
+
+_ENDERS = {
+    # ending honorifics
+    "ครับ",
+    "ค่ะ",
+    "คะ",
+    "นะคะ",
+    "นะ",
+    "จ้ะ",
+    "จ้า",
+    "จ๋า",
+    "ฮะ",
+    # enders
+    "ๆ",
+    "ได้",
+    "แล้ว",
+    "ด้วย",
+    "เลย",
+    "มาก",
+    "น้อย",
+    "กัน",
+    "เช่นกัน",
+    "เท่านั้น",
+    "อยู่",
+    "ลง",
+    "ขึ้น",
+    "มา",
+    "ไป",
+    "ไว้",
+    "เอง",
+    "อีก",
+    "ใหม่",
+    "จริงๆ",
+    "บ้าง",
+    "หมด",
+    "ทีเดียว",
+    "เดียว",
+    # demonstratives
+    "นั้น",
+    "นี้",
+    "เหล่านี้",
+    "เหล่านั้น",
+    # questions
+    "อย่างไร",
+    "ยังไง",
+    "หรือไม่",
+    "มั้ย",
+    "ไหน",
+    "ไหม",
+    "อะไร",
+    "ทำไม",
+    "เมื่อไหร่",
+    "เมื่อไร",
+}
+_STARTERS = {
+    # pronouns
+    "ผม",
+    "ฉัน",
+    "ดิฉัน",
+    "ชั้น",
+    "คุณ",
+    "มัน",
+    "เขา",
+    "เค้า",
+    "เธอ",
+    "เรา",
+    "พวกเรา",
+    "พวกเขา",
+    "กู",
+    "มึง",
+    "แก",
+    "ข้าพเจ้า",
+    # connectors
+    "และ",
+    "หรือ",
+    "แต่",
+    "เมื่อ",
+    "ถ้า",
+    "ใน",
+    "ด้วย",
+    "เพราะ",
+    "เนื่องจาก",
+    "ซึ่ง",
+    "ไม่",
+    "ตอนนี้",
+    "ทีนี้",
+    "ดังนั้น",
+    "เพราะฉะนั้น",
+    "ฉะนั้น",
+    "ตั้งแต่",
+    "ในที่สุด",
+    "ก็",
+    "กับ",
+    "แก่",
+    "ต่อ",
+    # demonstratives
+    "นั้น",
+    "นี้",
+    "เหล่านี้",
+    "เหล่านั้น",
+}
+
+
+
+[docs] +def extract_features( + doc: List[str], window: int = 2, max_n_gram: int = 3 +) -> List[List[str]]: + """ + Extract features for CRF by sliding `max_n_gram` of tokens + for +/- `window` from the current token + + :param List[str] doc: tokens from which features are to be extracted + :param int window: size of window before and after the current token + :param int max_n_gram: create n_grams from 1-gram to `max_n_gram`-gram \ + within the `window` + :return: list of lists of features to be fed to CRF + """ + doc_features = [] + doc = ( + ["xxpad" for i in range(window)] + + doc + + ["xxpad" for i in range(window)] + ) + + # add enders and starters + doc_ender = [] + doc_starter = [] + for i in range(len(doc)): + if doc[i] in _ENDERS: + doc_ender.append("ender") + else: + doc_ender.append("normal") + + if doc[i] in _STARTERS: + doc_starter.append("starter") + else: + doc_starter.append("normal") + + # for each word + for i in range(window, len(doc) - window): + # bias term + word_features = ["bias"] + # ngram features + for n_gram in range(1, min(max_n_gram + 1, 2 + window * 2)): + for j in range(i - window, i + window + 2 - n_gram): + feature_position = f"{n_gram}_{j-i}_{j-i+n_gram}" + word_ = f'{"|".join(doc[j:(j+n_gram)])}' + word_features += [f"word_{feature_position}={word_}"] + ender_ = f'{"|".join(doc_ender[j:(j+n_gram)])}' + word_features += [f"ender_{feature_position}={ender_}"] + starter_ = f'{"|".join(doc_starter[j:(j+n_gram)])}' + word_features += [f"starter_{feature_position}={starter_}"] + # append to feature per word + doc_features.append(word_features) + + return doc_features
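For a concrete view of the sliding window, ``extract_features`` can be called on a short, arbitrary token list; only the first few generated feature strings are shown::

    from pythainlp.tokenize.crfcut import extract_features

    toks = ["ผม", "ไป", "โรงเรียน", " ", "ครับ"]
    feats = extract_features(toks, window=2, max_n_gram=3)

    print(len(feats))    # 5: one feature list per input token
    print(feats[0][:3])  # ['bias', 'word_1_-2_-1=xxpad', 'ender_1_-2_-1=normal']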
+ + + +_CRFCUT_DATA_FILENAME = "sentenceseg_crfcut.model" +_tagger = pycrfsuite.Tagger() +_tagger.open(os.path.join(corpus_path(), _CRFCUT_DATA_FILENAME)) + + +
+[docs] +def segment(text: str) -> List[str]: + """ + CRF-based sentence segmentation. + + :param str text: text to be tokenized into sentences + :return: list of words, tokenized from the text + """ + if isinstance(text, str): + toks = word_tokenize(text) + else: + toks = text + feat = extract_features(toks) + labs = _tagger.tag(feat) + labs[-1] = "E" # make sure it cuts the last sentence + + # To ensure splitting of sentences using Terminal Punctuation + for idx, _ in enumerate(toks): + if toks[idx].strip().endswith(("!", ".", "?")): + labs[idx] = "E" + # Spaces or empty strings would no longer be treated as end of sentence. + elif (idx == 0 or labs[idx-1] == "E") and toks[idx].strip() == "": + labs[idx] = "I" + + sentences = [] + sentence = "" + for i, w in enumerate(toks): + sentence = sentence + w + # Empty strings should not be part of output. + if labs[i] == "E" and sentence != "": + sentences.append(sentence) + sentence = "" + + return sentences
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/etcc.html b/5.1/_modules/pythainlp/tokenize/etcc.html new file mode 100644 index 0000000..c58955a --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/etcc.html @@ -0,0 +1,208 @@ + + + + + + + + pythainlp.tokenize.etcc — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.etcc

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Segmenting text into Enhanced Thai Character Clusters (ETCCs)
+Python implementation by Wannaphong Phatthiyaphaibun
+
+This implementation relies on a dictionary of ETCC created from etcc.txt
+in pythainlp/corpus.
+
+Notebook:
+https://colab.research.google.com/drive/1UTQgxxMRxOr9Jp1B1jcq1frBNvorhtBQ
+
+:See Also:
+
+Jeeragone Inrut, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and
+Para Limmaneepraserth. "Thai word segmentation using combination of forward
+and backward longest matching techniques." In International Symposium on
+Communications and Information Technology (ISCIT), pp. 37-40. 2001.
+"""
+import re
+from typing import List
+
+from pythainlp import thai_follow_vowels
+from pythainlp.corpus import get_corpus
+from pythainlp.tokenize import Tokenizer
+
+_cut_etcc = Tokenizer(get_corpus("etcc.txt"), engine="longest")
+_PAT_ENDING_CHAR = f"[{thai_follow_vowels}ๆฯ]"
+_RE_ENDING_CHAR = re.compile(_PAT_ENDING_CHAR)
+
+
+def _cut_subword(tokens: List[str]) -> List[str]:
+    len_tokens = len(tokens)
+    i = 0
+    while True:
+        if i == len_tokens:
+            break
+        if _RE_ENDING_CHAR.search(tokens[i]) and i > 0 and len(tokens[i]) == 1:
+            tokens[i - 1] += tokens[i]
+            del tokens[i]
+            len_tokens -= 1
+        i += 1
+    return tokens
+
+
+
+[docs] +def segment(text: str) -> List[str]: + """ + Segmenting text into ETCCs. + + Enhanced Thai Character Cluster (ETCC) is a kind of subword unit. + The concept was presented in Inrut, Jeeragone, Patiroop Yuanghirun, + Sarayut Paludkong, Supot Nitsuwat, and Para Limmaneepraserth. + "Thai word segmentation using combination of forward and backward + longest matching techniques." In International Symposium on Communications + and Information Technology (ISCIT), pp. 37-40. 2001. + + :param str text: text to be tokenized into character clusters + :return: list of clusters, tokenized from the text + :return: List[str] + """ + + if not text or not isinstance(text, str): + return [] + + return _cut_subword(_cut_etcc.word_tokenize(text))
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/han_solo.html b/5.1/_modules/pythainlp/tokenize/han_solo.html new file mode 100644 index 0000000..2da3e79 --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/han_solo.html @@ -0,0 +1,284 @@ + + + + + + + + pythainlp.tokenize.han_solo — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.han_solo

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileCopyrightText: Copyright 2019 Ponrawee Prasertsom
+# SPDX-License-Identifier: Apache-2.0
+"""
+🪿 Han-solo: Thai syllable segmenter
+
+GitHub: https://github.com/PyThaiNLP/Han-solo
+"""
+from typing import List
+
+from pythainlp.corpus import path_pythainlp_corpus
+
+try:
+    import pycrfsuite
+except ImportError:
+    raise ImportError(
+        "ImportError; Install pycrfsuite by pip install python-crfsuite"
+    )
+
+tagger = pycrfsuite.Tagger()
+tagger.open(path_pythainlp_corpus("han_solo.crfsuite"))
+
+
+
+[docs] +class Featurizer: + # This class from ssg at https://github.com/ponrawee/ssg. + +
+[docs] + def __init__(self, N=2, sequence_size=1, delimiter=None): + self.N = N + self.delimiter = delimiter + self.radius = N + sequence_size
+ + +
+[docs] + def pad(self, sentence, padder="#"): + return padder * (self.radius) + sentence + padder * (self.radius)
+ + +
+[docs] + def featurize( + self, sentence, padding=True, indiv_char=True, return_type="list" + ): + if padding: + sentence = self.pad(sentence) + all_features = [] + all_labels = [] + skip_next = False + for current_position in range( + self.radius, len(sentence) - self.radius + 1 + ): + if skip_next: + skip_next = False + continue + features = {} + if return_type == "list": + features = [] + cut = 0 + char = sentence[current_position] + if char == self.delimiter: + cut = 1 + skip_next = True + counter = 0 + chars_left = "" + chars_right = "" + chars = "" + abs_index_left = current_position # left start at -1 + abs_index_right = current_position - 1 # right start at 0 + while counter < self.radius: + abs_index_left -= ( + 1 # สมมุติตำแหน่งที่ 0 จะได้ -1, -2, -3, -4, -5 (radius = 5) + ) + char_left = sentence[abs_index_left] + while char_left == self.delimiter: + abs_index_left -= 1 + char_left = sentence[abs_index_left] + relative_index_left = -counter - 1 + # เก็บตัวหนังสือ + chars_left = char_left + chars_left + # ใส่ลง feature + if indiv_char: + left_key = "|".join([str(relative_index_left), char_left]) + if return_type == "dict": + features[left_key] = 1 + else: + features.append(left_key) + + abs_index_right += ( + 1 # สมมุติคือตำแหน่งที่ 0 จะได้ 0, 1, 2, 3, 4 (radius = 5) + ) + char_right = sentence[abs_index_right] + while char_right == self.delimiter: + abs_index_right += 1 + char_right = sentence[abs_index_right] + relative_index_right = counter + chars_right += char_right + if indiv_char: + right_key = "|".join( + [str(relative_index_right), char_right] + ) + if return_type == "dict": + features[right_key] = 1 + else: + features.append(right_key) + + counter += 1 + + chars = chars_left + chars_right + for i in range(0, len(chars) - self.N + 1): + ngram = chars[i : i + self.N] + ngram_key = "|".join([str(i - self.radius), ngram]) + if return_type == "dict": + features[ngram_key] = 1 + else: + features.append(ngram_key) + all_features.append(features) + if return_type == "list": + cut = str(cut) + all_labels.append(cut) + + return {"X": all_features, "Y": all_labels}
+
+ + + +_to_feature = Featurizer() + + +
+[docs] +def segment(text: str) -> List[str]: + x = _to_feature.featurize(text)["X"] + y_pred = tagger.tag(x) + list_cut = [] + for j, k in zip(list(text), y_pred): + if k == "1": + list_cut.append(j) + else: + list_cut[-1] += j + return list_cut
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/longest.html b/5.1/_modules/pythainlp/tokenize/longest.html new file mode 100644 index 0000000..7ffdb40 --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/longest.html @@ -0,0 +1,324 @@ + + + + + + + + pythainlp.tokenize.longest — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.longest

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Dictionary-based longest-matching Thai word segmentation. The implementation
+is based on the code from Patorn Utenpattanun.
+
+:See Also:
+    * `GitHub Repository \
+       <https://github.com/patorn/thaitokenizer/blob/master/thaitokenizer/tokenizer.py>`_
+
+"""
+import re
+from typing import Dict, List, Union
+
+from pythainlp import thai_tonemarks
+from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
+from pythainlp.util import Trie
+
+_FRONT_DEP_CHAR = [
+    "ะ",
+    "ั",
+    "า ",
+    "ำ",
+    "ิ",
+    "ี",
+    "ึ",
+    "ื",
+    "ุ",
+    "ู",
+    "ๅ",
+    "็",
+    "์",
+    "ํ",
+]
+_REAR_DEP_CHAR = ["ั", "ื", "เ", "แ", "โ", "ใ", "ไ", "ํ"]
+_TRAILING_CHAR = ["ๆ", "ฯ"]
+
+_RE_NONTHAI = re.compile(r"[A-Za-z\d]*")
+
+_KNOWN = True
+_UNKNOWN = False
+
+
+
+[docs] +class LongestMatchTokenizer: +
+[docs] + def __init__(self, trie: Trie): + self.__trie = trie
+ + + @staticmethod + def __search_nonthai(text: str) -> Union[None, str]: + match = _RE_NONTHAI.search(text) + if match.group(0): + return match.group(0).lower() + return None + + def __is_next_word_valid(self, text: str, begin_pos: int) -> bool: + text = text[begin_pos:].strip() + + if not text: + return True + + match = self.__search_nonthai(text) + if match: + return True + + for pos in range(len(text) + 1): + if text[0:pos] in self.__trie: + return True + + return False + + def __longest_matching(self, text: str, begin_pos: int) -> str: + text = text[begin_pos:] + + match = self.__search_nonthai(text) + if match: + return match + + word = None + word_valid = None + + for pos in range(len(text) + 1): + w = text[0:pos] + if w in self.__trie: + word = w + if self.__is_next_word_valid(text, pos): + word_valid = w + + if word: + if not word_valid: + word_valid = word + + try: + len_word_valid = len(word_valid) + if text[len_word_valid] in _TRAILING_CHAR: + return text[0 : len_word_valid + 1] + else: + return word_valid + except BaseException: + return word_valid + else: + return "" + + def __segment(self, text: str): + begin_pos = 0 + len_text = len(text) + tokens = [] + token_statuses = [] + while begin_pos < len_text: + match = self.__longest_matching(text, begin_pos) + if not match: + if ( + begin_pos != 0 + and not text[begin_pos].isspace() + and ( + text[begin_pos] in _FRONT_DEP_CHAR + or text[begin_pos - 1] in _REAR_DEP_CHAR + or text[begin_pos] in thai_tonemarks + or (token_statuses and token_statuses[-1] == _UNKNOWN) + ) + ): + tokens[-1] += text[begin_pos] + token_statuses[-1] = _UNKNOWN + else: + tokens.append(text[begin_pos]) + token_statuses.append(_UNKNOWN) + begin_pos += 1 + else: + if begin_pos != 0 and text[begin_pos - 1] in _REAR_DEP_CHAR: + tokens[-1] += match + else: + tokens.append(match) + token_statuses.append(_KNOWN) + begin_pos += len(match) + + # Group consecutive spaces into one token + grouped_tokens = [] + for token in tokens: + if token.isspace() and grouped_tokens and grouped_tokens[-1].isspace(): + grouped_tokens[-1] += token + else: + grouped_tokens.append(token) + + return grouped_tokens + +
+[docs] + def tokenize(self, text: str) -> List[str]: + tokens = self.__segment(text) + return tokens
+
+ + + +_tokenizers: Dict[int, LongestMatchTokenizer] = {} + + +
+[docs] +def segment(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE) -> List[str]: + """ + Dictionary-based longest matching word segmentation. + + :param str text: text to be tokenized into words + :param pythainlp.util.Trie custom_dict: dictionary for tokenization + :return: list of words, tokenized from the text + """ + if not text or not isinstance(text, str): + return [] + + if not custom_dict: + custom_dict = DEFAULT_WORD_DICT_TRIE + + global _tokenizers + custom_dict_ref_id = id(custom_dict) + if custom_dict_ref_id not in _tokenizers: + _tokenizers[custom_dict_ref_id] = LongestMatchTokenizer(custom_dict) + + return _tokenizers[custom_dict_ref_id].tokenize(text)
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/multi_cut.html b/5.1/_modules/pythainlp/tokenize/multi_cut.html new file mode 100644 index 0000000..2ca688e --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/multi_cut.html @@ -0,0 +1,315 @@ + + + + + + + + pythainlp.tokenize.multi_cut — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.multi_cut

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Multi cut -- Thai word segmentation with maximum matching.
+Original code from Korakot Chaovavanich.
+
+:See Also:
+    * `Facebook post \
+        <https://www.facebook.com/groups/408004796247683/permalink/431283740586455/>`_
+    * `GitHub Gist \
+        <https://gist.github.com/korakot/fe26c65dc9eed467f4497f784a805716>`_
+"""
+
+import re
+from collections import defaultdict
+from typing import Iterator, List
+
+from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
+from pythainlp.util import Trie
+
+
+
+[docs] +class LatticeString(str): + """String that keeps possible tokenizations""" + + def __new__(cls, value, multi=None, in_dict=True): + return str.__new__(cls, value) + +
+[docs] + def __init__(self, value, multi=None, in_dict=True): + self.unique = True + if multi: + self.multi = list(multi) + if len(self.multi) > 1: + self.unique = False + else: + self.multi = [value] + self.in_dict = in_dict # if in dictionary
+
+ + + +_RE_NONTHAI = r"""(?x) +[-a-zA-Z]+| # Latin characters +\d+([,\.]\d+)*| # numbers +[ \t]+| # spaces +\r?\n # newlines +""" +_PAT_NONTHAI = re.compile(_RE_NONTHAI) + + +def _multicut( + text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE +) -> Iterator[LatticeString]: + """Return LatticeString""" + if not custom_dict: + custom_dict = DEFAULT_WORD_DICT_TRIE + + len_text = len(text) + words_at = defaultdict(list) # main data structure + + def serialize(p, p2): # helper function + for w in words_at[p]: + p_ = p + len(w) + if p_ == p2: + yield w + elif p_ < p2: + for path in serialize(p_, p2): + yield w + "/" + path + + q = {0} + last_p = 0 # last position for yield + while min(q) < len_text: + p = min(q) + q -= {p} # q.pop, but for set + + for w in custom_dict.prefixes(text[p:]): + words_at[p].append(w) + q.add(p + len(w)) + + len_q = len(q) + + if len_q == 1: + q0 = min(q) + yield LatticeString(text[last_p:q0], serialize(last_p, q0)) + last_p = q0 + elif len_q == 0: # len(q) == 0 means not found in dictionary + m = _PAT_NONTHAI.match(text[p:]) + if m: # non-Thai token + i = p + m.span()[1] + else: # non-Thai token, find minimum skip + for i in range(p, len_text): + ww = custom_dict.prefixes(text[i:]) + m = _PAT_NONTHAI.match(text[i:]) + if ww or m: + break + else: + i = len_text + w = text[p:i] + words_at[p].append(w) + yield LatticeString(w, in_dict=False) + last_p = i + q.add(i) + + +
+[docs] +def mmcut(text: str) -> List[str]: + res = [] + for w in _multicut(text): + mm = min(w.multi, key=lambda x: x.count("/")) + res.extend(mm.split("/")) + return res
+ + + +def _combine(ww: List[LatticeString]) -> Iterator[str]: + if ww == []: + yield "" + else: + w = ww[0] + for tail in _combine(ww[1:]): + if w.unique: + yield w + "|" + tail + else: + for m in w.multi: + yield m.replace("/", "|") + "|" + tail + + +
+[docs] +def segment( + text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE +) -> List[str]: + """Dictionary-based maximum matching word segmentation. + + :param text: text to be tokenized + :type text: str + :param custom_dict: tokenization dictionary,\ + defaults to DEFAULT_WORD_DICT_TRIE + :type custom_dict: Trie, optional + :return: list of segmented tokens + :rtype: List[str] + """ + if not text or not isinstance(text, str): + return [] + + return list(_multicut(text, custom_dict=custom_dict))
+ + + +
+[docs] +def find_all_segment( + text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE +) -> List[str]: + """Get all possible segment variations. + + :param text: input string to be tokenized + :type text: str + :param custom_dict: tokenization dictionary,\ + defaults to DEFAULT_WORD_DICT_TRIE + :type custom_dict: Trie, optional + :return: list of segment variations + :rtype: List[str] + """ + if not text or not isinstance(text, str): + return [] + + ww = list(_multicut(text, custom_dict=custom_dict)) + + return list(_combine(ww))
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/nercut.html b/5.1/_modules/pythainlp/tokenize/nercut.html new file mode 100644 index 0000000..d22a2a8 --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/nercut.html @@ -0,0 +1,220 @@ + + + + + + + + pythainlp.tokenize.nercut — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.nercut

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+nercut 0.2
+
+Dictionary-based maximal matching word segmentation, constrained by
+Thai Character Cluster (TCC) boundaries, and combining tokens that are
+parts of the same named entity.
+
+Code by Wannaphong Phatthiyaphaibun
+"""
+from typing import Iterable, List
+
+from pythainlp.tag.named_entity import NER
+
+_thainer = NER(engine="thainer")
+
+
+
+[docs] +def segment( + text: str, + taglist: Iterable[str] = [ + "ORGANIZATION", + "PERSON", + "PHONE", + "EMAIL", + "DATE", + "TIME", + ], + tagger=_thainer, +) -> List[str]: + """ + Dictionary-based maximal matching word segmentation, constrained by + Thai Character Cluster (TCC) boundaries, and combining tokens that are + parts of the same named-entity. + + :param str text: text to be tokenized into words + :param list taglist: a list of named entity tags to be used + :param class tagger: NER tagger engine + :return: list of words, tokenized from the text + """ + if not isinstance(text, str): + return [] + + tagged_words = tagger.tag(text, pos=False) + + words = [] + combining_word = "" + for idx, (curr_word, curr_tag) in enumerate(tagged_words): + if curr_tag != "O": + tag = curr_tag[2:] + else: + tag = "O" + + if curr_tag.startswith("B-") and tag in taglist: + combining_word = curr_word + elif ( + curr_tag.startswith("I-") + and combining_word != "" + and tag in taglist + ): + combining_word += curr_word + elif curr_tag == "O" and combining_word != "": + words.append(combining_word) + combining_word = "" + words.append(curr_word) + else: # if tag is O + combining_word = "" + words.append(curr_word) + if idx + 1 == len(tagged_words): + if curr_tag.startswith("B-") and combining_word != "": + words.append(combining_word) + elif curr_tag.startswith("I-") and combining_word != "": + words.append(combining_word) + else: + pass + + return words
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/newmm.html b/5.1/_modules/pythainlp/tokenize/newmm.html new file mode 100644 index 0000000..8df00ea --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/newmm.html @@ -0,0 +1,353 @@ + + + + + + + + pythainlp.tokenize.newmm — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.newmm

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Dictionary-based maximal matching word segmentation, constrained by
+Thai Character Cluster (TCC) boundaries with improved rules.
+
+The code is based on the notebooks created by Korakot Chaovavanich,
+with a heuristic graph-size limit added to avoid exponential waiting time.
+
+:See Also:
+    * \
+        https://colab.research.google.com/notebook#fileId=1V1Z657_5eSWPo8rLfVRwA0A5E4vkg7SI
+    * \
+        https://colab.research.google.com/drive/14Ibg-ngZXj15RKwjNwoZlOT32fQBOrBx#scrollTo=MYZ7NzAR7Dmw
+"""
+import re
+from collections import defaultdict
+from heapq import heappop, heappush
+from typing import Generator, List
+
+from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
+from pythainlp.tokenize.tcc_p import tcc_pos
+from pythainlp.util import Trie
+
+# match non-Thai tokens
+# `|` is used as like "early return",
+# which divides "abc123" to "abc", "123" for example.
+_PAT_NONTHAI = re.compile(
+    r"""(?x)
+[-a-zA-Z]+|        # Latin characters
+\d+([,\.]\d+)*|    # numbers
+[ \t]+|            # spaces
+\r?\n|             # newlines
+[^\u0E00-\u0E7F \t\r\n]+  # other non-Thai characters; stop matching at a space or newline
+"""
+)
+
+# match 2-consonant Thai tokens
+_PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$")
+
+
+# maximum graph size before cutoff
+_MAX_GRAPH_SIZE = 50
+
+# window size for safe mode
+_TEXT_SCAN_POINT = 120
+_TEXT_SCAN_LEFT = 20
+_TEXT_SCAN_RIGHT = 20
+_TEXT_SCAN_BEGIN = _TEXT_SCAN_POINT - _TEXT_SCAN_LEFT
+_TEXT_SCAN_END = _TEXT_SCAN_POINT + _TEXT_SCAN_RIGHT
+del _TEXT_SCAN_POINT
+del _TEXT_SCAN_LEFT
+del _TEXT_SCAN_RIGHT
+
+
+def _bfs_paths_graph(
+    graph: defaultdict, start: int, goal: int
+) -> Generator[List[int], None, None]:
+    queue = [(start, [start])]
+    while queue:
+        (vertex, path) = queue.pop(0)
+        for pos in graph[vertex]:
+            if pos == goal:
+                yield path + [pos]
+            else:
+                queue.append((pos, path + [pos]))
+
+
+def _onecut(text: str, custom_dict: Trie) -> Generator[str, None, None]:
+    # main data structure:
+    # - key is beginning position (int)
+    # - value is possible ending positions (List[int])
+    # if key is not found, value is empty list
+    graph = defaultdict(list)
+
+    graph_size = 0  # keep track of graph size, if too big, force cutoff
+
+    valid_poss = tcc_pos(text)  # breaking positions that are TCC-valid
+
+    len_text = len(text)
+    pos_list = [0]  # priority queue of possible breaking positions
+    end_pos = 0
+    while pos_list[0] < len_text:
+        begin_pos = heappop(pos_list)
+        for word in custom_dict.prefixes(text[begin_pos:]):
+            end_pos_candidate = begin_pos + len(word)
+            if end_pos_candidate in valid_poss:
+                graph[begin_pos].append(end_pos_candidate)
+                graph_size = graph_size + 1
+
+                if end_pos_candidate not in pos_list:
+                    heappush(pos_list, end_pos_candidate)
+
+                if graph_size > _MAX_GRAPH_SIZE:
+                    break
+
+        len_pos_list = len(pos_list)
+        if len_pos_list == 1:  # one candidate, no longer ambiguous
+            end_pos_candidates = next(
+                _bfs_paths_graph(graph, end_pos, pos_list[0])
+            )
+            graph_size = 0
+            for pos in end_pos_candidates[1:]:
+                yield text[end_pos:pos]
+                end_pos = pos
+        elif len_pos_list == 0:  # no candidate, deal with non-dictionary word
+            m = _PAT_NONTHAI.match(text[begin_pos:])
+            if m:  # non-Thai token, skip to the end
+                end_pos = begin_pos + m.end()
+            else:  # Thai token, find minimum skip
+                for pos in range(begin_pos + 1, len_text):
+                    if pos in valid_poss:
+                        prefix = text[pos:]
+                        words = [
+                            word
+                            for word in custom_dict.prefixes(prefix)
+                            if (
+                                (pos + len(word) in valid_poss)
+                                and not _PAT_THAI_TWOCHARS.match(word)
+                            )
+                        ]
+                        if words:  # is a Thai token that longer than 2 chars
+                            end_pos = pos
+                            break
+
+                        # is a non-Thai token
+                        if _PAT_NONTHAI.match(prefix):
+                            end_pos = pos
+                            break
+                else:
+                    end_pos = len_text
+
+            graph[begin_pos].append(end_pos)
+            graph_size = graph_size + 1
+            yield text[begin_pos:end_pos]
+            heappush(pos_list, end_pos)
+
+
+
+[docs] +def segment( + text: str, + custom_dict: Trie = DEFAULT_WORD_DICT_TRIE, + safe_mode: bool = False, +) -> List[str]: + """Maximal-matching word segmentation constrained by Thai Character Cluster. + + A dictionary-based word segmentation using maximal matching algorithm, + constrained by Thai Character Cluster boundaries. + + A custom dictionary can be supplied. + + :param text: text to be tokenized + :type text: str + :param custom_dict: tokenization dictionary,\ + defaults to DEFAULT_WORD_DICT_TRIE + :type custom_dict: Trie, optional + :param safe_mode: reduce chance for long processing time for long text\ + with many ambiguous breaking points, defaults to False + :type safe_mode: bool, optional + :return: list of tokens + :rtype: List[str] + """ + if not text or not isinstance(text, str): + return [] + + if not custom_dict: + custom_dict = DEFAULT_WORD_DICT_TRIE + + if not safe_mode or len(text) < _TEXT_SCAN_END: + return list(_onecut(text, custom_dict)) + + # if the text is longer than the limit, + # break them into smaller chunks, then tokenize each chunk + text_parts = [] + while len(text) >= _TEXT_SCAN_END: + sample = text[_TEXT_SCAN_BEGIN:_TEXT_SCAN_END] + + # find possible breaking positions + cut_pos = _TEXT_SCAN_END + + # try to break by space first + space_idx = sample.rfind(" ") + if space_idx >= 0: + cut_pos = space_idx + 1 + _TEXT_SCAN_BEGIN + else: + tokens = list(_onecut(sample, custom_dict)) + token_max_idx = 0 + token_max_len = 0 + for i, token in enumerate(tokens): + if len(token) >= token_max_len: + token_max_len = len(token) + token_max_idx = i + + # choose the position that covers longest token + cut_pos = _TEXT_SCAN_BEGIN + for i in range(0, token_max_idx): + cut_pos = cut_pos + len(tokens[i]) + + text_parts.append(text[:cut_pos]) + text = text[cut_pos:] + + # append remaining text + if len(text): + text_parts.append(text) + + # tokenizes each text part + tokens = [] + for text_part in text_parts: + tokens.extend(list(_onecut(text_part, custom_dict))) + + return tokens
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/nlpo3.html b/5.1/_modules/pythainlp/tokenize/nlpo3.html new file mode 100644 index 0000000..433476e --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/nlpo3.html @@ -0,0 +1,216 @@ + + + + + + + + pythainlp.tokenize.nlpo3 — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.nlpo3

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from sys import stderr
+from typing import List
+
+from nlpo3 import load_dict as nlpo3_load_dict
+from nlpo3 import segment as nlpo3_segment
+
+from pythainlp.corpus import path_pythainlp_corpus
+from pythainlp.corpus.common import _THAI_WORDS_FILENAME
+
+_NLPO3_DEFAULT_DICT_NAME = "_73bcj049dzbu9t49b4va170k"  # supposed to be unique
+_NLPO3_DEFAULT_DICT = nlpo3_load_dict(
+    path_pythainlp_corpus(_THAI_WORDS_FILENAME), _NLPO3_DEFAULT_DICT_NAME
+)  # preload the default dict so it is accessible via _NLPO3_DEFAULT_DICT_NAME
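+
+# Example usage (illustrative sketch, not part of the original module; the
+# dictionary path and name below are hypothetical):
+#
+#     from pythainlp.tokenize.nlpo3 import load_dict, segment
+#
+#     load_dict("/path/to/my_words.txt", "my_dict")   # hypothetical file
+#     segment("ฉันรักภาษาไทย", custom_dict="my_dict", parallel_mode=False)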
+
+
+
+[docs] +def load_dict(file_path: str, dict_name: str) -> bool: + """Load a dictionary file into an in-memory dictionary collection. + + The loaded dictionary will be accessible through the assigned dict_name. + *** This function will not override an existing dict name. *** + + :param file_path: Path to a dictionary file + :type file_path: str + :param dict_name: A unique dictionary name, used for reference. + :type dict_name: str + :return success: True if loaded successfully, False otherwise. + :rtype: bool + + :See Also: + * \ + https://github.com/PyThaiNLP/nlpo3 + """ + msg, success = nlpo3_load_dict(file_path=file_path, dict_name=dict_name) + if not success: + print(msg, file=stderr) + return success
+ + + +
+[docs] +def segment( + text: str, + custom_dict: str = _NLPO3_DEFAULT_DICT_NAME, + safe_mode: bool = False, + parallel_mode: bool = False, +) -> List[str]: + """Break text into tokens. + + Python binding for nlpO3. It is newmm engine in Rust. + + :param str text: text to be tokenized + :param str custom_dict: dictionary name, as assigned with load_dict(),\ + defaults to pythainlp/corpus/common/words_th.txt + :param bool safe_mode: reduce chance for long processing time for long text\ + with many ambiguous breaking points, defaults to False + :param bool parallel_mode: Use multithread mode, defaults to False + + :return: list of tokens + :rtype: List[str] + + :See Also: + * \ + https://github.com/PyThaiNLP/nlpo3 + """ + return nlpo3_segment( + text=text, + dict_name=custom_dict, + safe=safe_mode, + parallel=parallel_mode, + )
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/oskut.html b/5.1/_modules/pythainlp/tokenize/oskut.html new file mode 100644 index 0000000..d8f3517 --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/oskut.html @@ -0,0 +1,169 @@ + + + + + + + + pythainlp.tokenize.oskut — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.oskut

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Wrapper for OSKut (Out-of-domain StacKed cut for Word Segmentation):
+"Handling Cross- and Out-of-Domain Samples in Thai Word Segmentation",
+a stacked ensemble framework with DeepCut as the baseline model (ACL 2021 Findings).
+
+:See Also:
+    * `GitHub repository <https://github.com/mrpeerat/OSKut>`_
+"""
+from typing import List
+
+import oskut
+
+DEFAULT_ENGINE = "ws"
+oskut.load_model(engine=DEFAULT_ENGINE)
+
+
+
+[docs] +def segment(text: str, engine: str = "ws") -> List[str]: + global DEFAULT_ENGINE + if not text or not isinstance(text, str): + return [] + if engine != DEFAULT_ENGINE: + DEFAULT_ENGINE = engine + oskut.load_model(engine=DEFAULT_ENGINE) + return oskut.OSKut(text)
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/pyicu.html b/5.1/_modules/pythainlp/tokenize/pyicu.html new file mode 100644 index 0000000..c27e210 --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/pyicu.html @@ -0,0 +1,180 @@ + + + + + + + + pythainlp.tokenize.pyicu — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.pyicu

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Wrapper for PyICU word segmentation. This wrapper module uses
+:class:`icu.BreakIterator` with a Thai :class:`icu.Locale`
+to locate boundaries between words in the text.
+
+:See Also:
+    * `GitHub repository <https://github.com/ovalhub/pyicu>`_
+"""
+import re
+from typing import Iterator, List
+
+from icu import BreakIterator, Locale
+
+bd = BreakIterator.createWordInstance(Locale("th"))
+
+def _gen_words(text: str) -> Iterator[str]:
+    global bd
+    bd.setText(text)
+    p = bd.first()
+    for q in bd:
+        yield text[p:q]
+        p = q
+
+
+
+[docs] +def segment(text: str) -> List[str]: + """ + :param str text: text to be tokenized into words + :return: list of words, tokenized from the text + """ + if not text or not isinstance(text, str): + return [] + + text = re.sub("([^\u0E00-\u0E7F\n ]+)", " \\1 ", text) + + return list(_gen_words(text))
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/sefr_cut.html b/5.1/_modules/pythainlp/tokenize/sefr_cut.html new file mode 100644 index 0000000..08c08f9 --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/sefr_cut.html @@ -0,0 +1,168 @@ + + + + + + + + pythainlp.tokenize.sefr_cut — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.sefr_cut

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Wrapper for SEFR CUT Thai word segmentation. SEFR CUT is a
+Thai word segmentation model using a stacked ensemble.
+
+:See Also:
+    * `GitHub repository <https://github.com/mrpeerat/SEFR_CUT>`_
+"""
+from typing import List
+
+import sefr_cut
+
+DEFAULT_ENGINE = "ws1000"
+sefr_cut.load_model(engine=DEFAULT_ENGINE)
+
+
+
+[docs] +def segment(text: str, engine: str = "ws1000") -> List[str]: + global DEFAULT_ENGINE + if not text or not isinstance(text, str): + return [] + if engine != DEFAULT_ENGINE: + DEFAULT_ENGINE = engine + sefr_cut.load_model(engine=DEFAULT_ENGINE) + return sefr_cut.tokenize(text)[0]
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/tcc.html b/5.1/_modules/pythainlp/tokenize/tcc.html new file mode 100644 index 0000000..ccf2db2 --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/tcc.html @@ -0,0 +1,263 @@ + + + + + + + + pythainlp.tokenize.tcc — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.tcc

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+The implementation of a tokenizer following the Thai Character Cluster (TCC)
+rules proposed by `Theeramunkong et al. 2000. \
+    <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548>`_
+
+Credits:
+    * TCC: Jakkrit TeCho
+    * Grammar: Wittawat Jitkrittum (`link to the source file \
+      <https://github.com/wittawatj/jtcc/blob/master/TCC.g>`_)
+    * Python code: Korakot Chaovavanich
+"""
+import re
+from typing import List, Set
+
+_RE_TCC = (
+    """\
+c[ั]([่-๋]c)?
+c[ั]([่-๋]c)?k
+เc็ck
+เcctาะk
+เccีtยะk
+เccีtย(?=[เ-ไก-ฮ]|$)k
+เc[ิีุู]tย(?=[เ-ไก-ฮ]|$)k
+เcc็ck
+เcิc์ck
+เcิtck
+เcีtยะ?k
+เcืtอะk
+เcื
+เctา?ะ?k
+c[ึื]tck
+c[ะ-ู]tk
+c[ิุู]์
+cรรc์
+c็
+ct[ะาำ]?k
+แc็ck
+แcc์k
+แctะk
+แcc็ck
+แccc์k
+โctะk
+[เ-ไ]ctk
+ก็
+อึ
+หึ
+""".replace(
+        "k", "(cc?[d|ิ]?[์])?"
+    )
+    .replace("c", "[ก-ฮ]")
+    .replace("t", "[่-๋]?")
+    .replace("d", "อูอุ".replace("อ", ""))  # DSara: lower vowel
+    .split()
+)
+
+_PAT_TCC = re.compile("|".join(_RE_TCC))
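+
+# Example usage (illustrative sketch, not part of the original module):
+#
+#     from pythainlp.tokenize.tcc import segment, tcc_pos
+#
+#     segment("ประเทศไทย")
+#     # -> ['ป', 'ระ', 'เท', 'ศ', 'ไท', 'ย']
+#     tcc_pos("ประเทศไทย")
+#     # -> {1, 3, 5, 6, 8, 9}  (ending positions of the clusters above)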
+
+
+
+[docs] +def tcc(text: str) -> str: + """ + TCC generator which generates Thai Character Clusters + + :param str text: text to be tokenized into character clusters + :return: subwords (character clusters) + :rtype: Iterator[str] + """ + if not text or not isinstance(text, str): + return "" + + len_text = len(text) + p = 0 + while p < len_text: + m = _PAT_TCC.match(text[p:]) + if m: + n = m.span()[1] + else: + n = 1 + yield text[p : p + n] + p += n
+ + + +
+[docs] +def tcc_pos(text: str) -> Set[int]: + """ + TCC positions + + :param str text: text to be tokenized into character clusters + :return: list of the ending position of subwords + :rtype: set[int] + """ + if not text or not isinstance(text, str): + return set() + + p_set = set() + p = 0 + for w in tcc(text): + p += len(w) + p_set.add(p) + + return p_set
+ + + +
+[docs] +def segment(text: str) -> List[str]: + """ + Subword segmentation + + :param str text: text to be tokenized into character clusters + :return: list of subwords (character clusters), tokenized from the text + :rtype: list[str] + + """ + + return list(tcc(text))
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/tcc_p.html b/5.1/_modules/pythainlp/tokenize/tcc_p.html new file mode 100644 index 0000000..0701bba --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/tcc_p.html @@ -0,0 +1,263 @@ + + + + + + + + pythainlp.tokenize.tcc_p — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tokenize.tcc_p

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+The implementation of a tokenizer following the Thai Character Cluster (TCC)
+rules proposed by `Theeramunkong et al. 2000. \
+    <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548>`_
+and improved rules that are used in newmm
+
+Credits:
+    * TCC: Jakkrit TeCho
+    * Grammar: Wittawat Jitkrittum (`link to the source file \
+      <https://github.com/wittawatj/jtcc/blob/master/TCC.g>`_)
+    * Python code: Korakot Chaovavanich
+"""
+import re
+from typing import List, Set
+
+_RE_TCC = (
+    """\
+เc็ck
+เcctาะk
+เccีtยะk
+เccีtย(?=[เ-ไก-ฮ]|$)k
+เcc็ck
+เcิc์ck
+เcิtck
+เcีtยะ?k
+เcืtอะ?k
+เc[ิีุู]tย(?=[เ-ไก-ฮ]|$)k
+เctา?ะ?k
+cัtวะk
+c[ัื]tc[ุิะ]?k
+c[ิุู]์
+c[ะ-ู]tk
+cรรc์
+c็
+ct[ะาำ]?k
+ck
+แc็c
+แcc์
+แctะ
+แcc็c
+แccc์
+โctะ
+[เ-ไ]ct
+ก็
+อึ
+หึ
+""".replace(
+        "k", "(cc?[dิ]?[์])?"
+    )
+    .replace("c", "[ก-ฮ]")
+    .replace("t", "[่-๋]?")
+    .replace("d", "อูอุ".replace("อ", ""))  # DSara: lower vowel
+    .split()
+)
+
+_PAT_TCC = re.compile("|".join(_RE_TCC))
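+
+# Example usage (illustrative sketch, not part of the original module; tcc_p
+# applies the newmm-adjusted rules, so clusters may differ slightly from tcc):
+#
+#     from pythainlp.tokenize.tcc_p import segment, tcc_pos
+#
+#     segment("ประเทศไทย")    # -> a list of character clusters
+#     tcc_pos("ประเทศไทย")    # -> the set of cluster-ending positions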
+
+
+
+[docs] +def tcc(text: str) -> str: + """ + TCC generator which generates Thai Character Clusters + + :param str text: text to be tokenized into character clusters + :return: subwords (character clusters) + :rtype: Iterator[str] + """ + if not text or not isinstance(text, str): + return "" + + len_text = len(text) + p = 0 + while p < len_text: + m = _PAT_TCC.match(text[p:]) + if m: + n = m.span()[1] + else: + n = 1 + yield text[p : p + n] + p += n
+ + + +
+[docs] +def tcc_pos(text: str) -> Set[int]: + """ + TCC positions + + :param str text: text to be tokenized into character clusters + :return: list of the ending position of subwords + :rtype: set[int] + """ + if not text or not isinstance(text, str): + return set() + + p_set = set() + p = 0 + for w in tcc(text): + p += len(w) + p_set.add(p) + + return p_set
+ + + +
+[docs] +def segment(text: str) -> List[str]: + """ + Subword segmentation + + :param str text: text to be tokenized into character clusters + :return: list of subwords (character clusters), tokenized from the text + :rtype: list[str] + + """ + + return list(tcc(text))
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tokenize/thaisumcut.html b/5.1/_modules/pythainlp/tokenize/thaisumcut.html new file mode 100644 index 0000000..8b0f168 --- /dev/null +++ b/5.1/_modules/pythainlp/tokenize/thaisumcut.html @@ -0,0 +1,529 @@ + + + + + + + + pythainlp.tokenize.thaisumcut — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
+
+
+
+
+
+ +

Source code for pythainlp.tokenize.thaisumcut

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileCopyrightText: Copyright 2020 Nakhun Chumpolsathien
+# SPDX-License-Identifier: Apache-2.0
+"""
+The implementation of the sentence segmenter by Nakhun Chumpolsathien, 2020.
+Original code is from: https://github.com/nakhunchumpolsathien/ThaiSum
+
+Cite:
+
+@mastersthesis{chumpolsathien_2020,
+    title={Using Knowledge Distillation from Keyword Extraction to Improve the Informativeness of Neural Cross-lingual Summarization},
+    author={Chumpolsathien, Nakhun},
+    year={2020},
+    school={Beijing Institute of Technology}
+}
+"""
+
+import math
+import operator
+import re
+from typing import List
+
+from pythainlp.tokenize import word_tokenize
+
+
+
+[docs] +def list_to_string(list: List[str]) -> str: + string = "".join(list) + string = " ".join(string.split()) + return string
+ + + +
+[docs] +def middle_cut(sentences: List[str]) -> List[str]: + new_text = "" + for sentence in sentences: + sentence_size = len(word_tokenize(sentence, keep_whitespace=False)) + + for k in range(0, len(sentence)): + if k == 0 or k + 1 >= len(sentence): + continue + if sentence[k].isdigit() and sentence[k - 1] == " ": + sentence = sentence[: k - 1] + sentence[k:] + if k + 2 <= len(sentence): + if sentence[k].isdigit() and sentence[k + 1] == " ": + sentence = sentence[: k + 1] + sentence[k + 2 :] + + fixed_text_lenth = 20 + + if sentence_size > fixed_text_lenth: + partition = math.floor(sentence_size / fixed_text_lenth) + tokens = word_tokenize(sentence, keep_whitespace=True) + for i in range(0, partition): + middle_space = sentence_size / (partition + 1) * (i + 1) + white_space_index = [] + white_space_diff = {} + + for j in range(len(tokens)): + if tokens[j] == " ": + white_space_index.append(j) + + for white_space in white_space_index: + white_space_diff.update( + {white_space: abs(white_space - middle_space)} + ) + + if len(white_space_diff) > 0: + min_diff = min( + white_space_diff.items(), key=operator.itemgetter(1) + ) + tokens.pop(min_diff[0]) + tokens.insert(min_diff[0], "<stop>") + new_text = new_text + list_to_string(tokens) + "<stop>" + else: + new_text = new_text + sentence + "<stop>" + + sentences = new_text.split("<stop>") + sentences = [s.strip() for s in sentences] + if "" in sentences: + sentences.remove("") + if "nan" in sentences: + sentences.remove("nan") + + sentences = list(filter(None, sentences)) + return sentences
+ + + +
+[docs] +class ThaiSentenceSegmentor: +
+[docs] + def split_into_sentences( + self, text: str, isMiddleCut: bool = False + ) -> List[str]: + # Declare Variables + th_alphabets = "([ก-๙])" + th_conjunction = "(ทำให้|โดย|เพราะ|นอกจากนี้|แต่|กรณีที่|หลังจากนี้|ต่อมา|ภายหลัง|นับตั้งแต่|หลังจาก|ซึ่งเหตุการณ์|ผู้สื่อข่าวรายงานอีก|ส่วนที่|ส่วนสาเหตุ|ฉะนั้น|เพราะฉะนั้น|เพื่อ|เนื่องจาก|จากการสอบสวนทราบว่า|จากกรณี|จากนี้|อย่างไรก็ดี)" + th_cite = "(กล่าวว่า|เปิดเผยว่า|รายงานว่า|ให้การว่า|เผยว่า|บนทวิตเตอร์ว่า|แจ้งว่า|พลเมืองดีว่า|อ้างว่า)" + th_ka_krub = "(ครับ|ค่ะ)" + th_stop_after = "(หรือไม่|โดยเร็ว|แล้ว|อีกด้วย)" + th_stop_before = "(ล่าสุด|เบื้องต้น|ซึ่ง|ทั้งนี้|แม้ว่า|เมื่อ|แถมยัง|ตอนนั้น|จนเป็นเหตุให้|จากนั้น|อย่างไรก็ตาม|และก็|อย่างใดก็ตาม|เวลานี้|เช่น|กระทั่ง)" + degit = "([0-9])" + th_title = "(นาย|นาง|นางสาว|เด็กชาย|เด็กหญิง|น.ส.|ด.ช.|ด.ญ.)" + + text = f" {text} " + text = text.replace("\n", " ") + text = text.replace("", "") + text = text.replace("โดยเร็ว", "<rth_Doeirew>") + text = text.replace("เพื่อน", "<rth_friend>") + text = text.replace("แต่ง", "<rth_but>") + text = text.replace("โดยสาร", "<rth_passenger>") + text = text.replace("แล้วแต่", "<rth_leawtea>") + text = text.replace("หรือเปล่า", "<rth_repraw>") + text = text.replace("หรือไม่", "<rth_remai>") + text = text.replace("จึงรุ่งเรืองกิจ", "<rth_tanatorn_lastname>") + text = text.replace("ตั้งแต่", "<rth_tangtea>") + text = text.replace("แต่ละ", "<rth_teala>") + text = text.replace("วิตแล้ว", "<rth_chiwitleaw>") + text = text.replace("โดยประ", "<rth_doipra>") + text = text.replace("แต่หลังจากนั้น", "<rth_tealangjaknan>") + text = text.replace("พรรคเพื่อ", "<for_party>") + text = text.replace("แต่เนื่อง", "<rth_teaneung>") + text = text.replace("เพื่อทำให้", "เพื่อ<rth_tamhai>") + text = text.replace("ทำเพื่อ", "ทำ<rth_for>") + text = text.replace("จึงทำให้", "จึง<tamhai>") + text = text.replace("มาโดยตลอด", "<madoitalod>") + text = text.replace("แต่อย่างใด", "<teayangdaikptam>") + text = text.replace("แต่หลังจาก", "แต่<langjak>") + text = text.replace("คงทำให้", "<rth_kongtamhai>") + text = text.replace("แต่ทั้งนี้", "แต่<tangni>") + text = text.replace("มีแต่", "มี<tea>") + text = text.replace("เหตุที่ทำให้", "<hedteetamhai>") + text = text.replace("โดยหลังจาก", "โดย<langjak>") + text = text.replace("ซึ่งหลังจาก", "ซึ่ง<langjak>") + text = text.replace("ตั้งโดย", "<rth_tangdoi>") + text = text.replace("โดยตรง", "<rth_doitong>") + text = text.replace("นั้นหรือ", "<rth_nanhlor>") + text = text.replace("ซึ่งต้องทำให้", "ซึ่งต้อง<tamhai>") + text = text.replace("ชื่อต่อมา", "ชื่อ<tomar>") + text = text.replace("โดยเร่งด่วน", "<doi>เร่งด่วน") + text = text.replace("ไม่ได้ทำให้", "ไม่ได้<tamhai>") + text = text.replace("จะทำให้", "จะ<tamhai>") + text = text.replace("จนทำให้", "จน<tamhai>") + text = text.replace("เว้นแต่", "เว้น<rth_tea>") + text = text.replace("ก็ทำให้", "ก็<tamhai>") + text = text.replace(" ณ ตอนนั้น", " ณ <tonnan>") + text = text.replace("บางส่วน", "บาง<rth_suan>") + text = text.replace("หรือแม้แต่", "หรือ<rth_meatea>") + text = text.replace("โดยทำให้", "โดย<tamhai>") + text = text.replace("หรือเพราะ", "หรือ<rth_orbecause>") + text = text.replace("มาแต่", "มา<rth_tea>") + text = text.replace("แต่ไม่ทำให้", "แต่<maitamhai>") + text = text.replace("ฉะนั้นเมื่อ", "ฉะนั้น<rth_moe>") + text = text.replace("เพราะฉะนั้น", "เพราะ<rth_chanan>") + text = text.replace("เพราะหลังจาก", "เพราะ<rth_langjak>") + text = text.replace("สามารถทำให้", "สามารถ<rth_tamhai>") + text = text.replace("อาจทำ", "อาจ<rth_tam>") + text = text.replace("จะทำ", "จะ<rth_tam>") + text = 
text.replace("และนอกจากนี้", "นอกจากนี้") + text = text.replace("อีกทั้งเพื่อ", "อีกทั้ง<rth_for>") + text = text.replace("ทั้งนี้เพื่อ", "ทั้งนี้<rth_for>") + text = text.replace("เวลาต่อมา", "เวลา<rth_toma>") + text = text.replace("อย่างไรก็ตาม", "อย่างไรก็ตาม") + text = text.replace( + "อย่างไรก็ตามหลังจาก", "<stop>อย่างไรก็ตาม<rth_langjak>" + ) + text = text.replace("ซึ่งทำให้", "ซึ่ง<rth_tamhai>") + text = text.replace("โดยประมาท", "<doi>ประมาท") + text = text.replace("โดยธรรม", "<doi>ธรรม") + text = text.replace("โดยสัจจริง", "<doi>สัจจริง") + + if "และ" in text: + tokens = word_tokenize(text.strip(), keep_whitespace=True) + and_position = -1 + nearest_space_position = -1 + last_position = len(tokens) + pop_split_position = [] + split_position = [] + for i in range(len(tokens)): + if tokens[i] == "และ": + and_position = i + + if ( + and_position != -1 + and i > and_position + and tokens[i] == " " + and nearest_space_position == -1 + ): + if i - and_position != 1: + nearest_space_position = i + + if and_position != -1 and last_position - and_position == 3: + split_position.append(last_position) + and_position = -1 + nearest_space_position = -1 + + if nearest_space_position != -1: + if nearest_space_position - and_position < 5: + pop_split_position.append(nearest_space_position) + else: + split_position.append(and_position) + and_position = -1 + nearest_space_position = -1 + for pop in pop_split_position: + tokens.pop(pop) + tokens.insert(pop, "<stop>") + for split in split_position: + tokens.insert(split, "<stop>") + text = list_to_string(tokens) + + if "หรือ" in text: + tokens = word_tokenize(text.strip(), keep_whitespace=True) + or_position = -1 + nearest_space_position = -1 + last_position = len(tokens) + pop_split_position = [] + split_position = [] + for i in range(len(tokens)): + if tokens[i] == "หรือ": + or_position = i + if ( + or_position != -1 + and i > or_position + and tokens[i] == " " + and nearest_space_position == -1 + ): + if i - or_position != 1: + nearest_space_position = i + + if or_position != -1 and last_position - or_position == 3: + split_position.append(last_position) + or_position = -1 + nearest_space_position = -1 + + if nearest_space_position != -1: + if nearest_space_position - or_position < 4: + pop_split_position.append(nearest_space_position) + else: + split_position.append(or_position) + or_position = -1 + nearest_space_position = -1 + for pop in pop_split_position: + tokens.pop(pop) + tokens.insert(pop, "<stop>") + for split in split_position: + tokens.insert(split, "<stop>") + text = list_to_string(tokens) + + if "จึง" in text: + tokens = word_tokenize(text.strip(), keep_whitespace=True) + cung_position = -1 + nearest_space_position = -1 + pop_split_position = [] + last_position = len(tokens) + split_position = [] + for i in range(len(tokens)): + if tokens[i] == "จึง": + cung_position = i + + if ( + cung_position != -1 + and tokens[i] == " " + and i > cung_position + and nearest_space_position == -1 + ): + if i - cung_position != 1: + nearest_space_position = i + + if cung_position != -1 and last_position - cung_position == 2: + split_position.append(last_position) + cung_position = -1 + nearest_space_position = -1 + + if nearest_space_position != -1: + if nearest_space_position - cung_position < 3: + pop_split_position.append(nearest_space_position) + else: + split_position.append(cung_position) + cung_position = -1 + nearest_space_position = -1 + + for pop in pop_split_position: + tokens.pop(pop) + tokens.insert(pop, "<stop>") + for split in 
split_position: + tokens.insert(split, "<stop>") + + text = list_to_string(tokens) + + text = re.sub(" " + th_stop_before, "<stop>\\1", text) + text = re.sub(th_ka_krub, "\\1<stop>", text) + text = re.sub(th_conjunction, "<stop>\\1", text) + text = re.sub(th_cite, "\\1<stop>", text) + text = re.sub(" " + degit + "[.]" + th_title, "<stop>\\1.\\2", text) + text = re.sub( + " " + degit + degit + "[.]" + th_title, "<stop>\\1\\2.\\3", text + ) + text = re.sub(th_alphabets + th_stop_after + " ", "\\1\\2<stop>", text) + if "”" in text: + text = text.replace(".”", "”.") + if '"' in text: + text = text.replace('."', '".') + if "!" in text: + text = text.replace('!"', '"!') + if "?" in text: + text = text.replace('?"', '"?') + text = text.replace("<rth_Doeirew>", "โดยเร็ว") + text = text.replace("<rth_friend>", "เพื่อน") + text = text.replace("<rth_but>", "แต่ง") + text = text.replace("<rth_passenger>", "โดยสาร") + text = text.replace("<rth_leawtea>", "แล้วแต่") + text = text.replace("<rth_repraw>", "หรือเปล่า") + text = text.replace("<rth_remai>", "หรือไม่") + text = text.replace("<rth_tanatorn_lastname>", "จึงรุ่งเรืองกิจ") + text = text.replace("<rth_tangtea>", "ตั้งแต่") + text = text.replace("<rth_teala>", "แต่ละ") + text = text.replace("<rth_chiwitleaw>", "วิตแล้ว") + text = text.replace("<rth_doipra>", "โดยประ") + text = text.replace("<rth_tealangjaknan>", "แต่หลังจากนั้น") + text = text.replace("<for_party>", "พรรคเพื่อ") + text = text.replace("<rth_teaneung>", "แต่เนื่อง") + text = text.replace("เพื่อ<rth_tamhai>", "เพื่อทำให้") + text = text.replace("ทำ<rth_for>", "ทำเพื่อ") + text = text.replace("จึง<tamhai>", "จึงทำให้") + text = text.replace("<madoitalod>", "มาโดยตลอด") + text = text.replace("แต่<langjak>", "แต่หลังจาก") + text = text.replace("แต่<tangni>", "แต่ทั้งนี้") + text = text.replace("มี<tea>", "มีแต่") + text = text.replace("<teayangdaikptam>", "แต่อย่างใด") + text = text.replace("<rth_kongtamhai>", "คงทำให้") + text = text.replace("<hedteetamhai>", "เหตุที่ทำให้") + text = text.replace("โดย<langjak>", "โดยหลังจาก") + text = text.replace("ซึ่ง<langjak>", "ซึ่งหลังจาก") + text = text.replace("<rth_tangdoi>", "ตั้งโดย") + text = text.replace("<rth_doitong>", "โดยตรง") + text = text.replace("<rth_nanhlor>", "นั้นหรือ") + text = text.replace("ซึ่งต้อง<tamhai>", "ซึ่งต้องทำให้") + text = text.replace("ชื่อ<tomar>", "ชื่อต่อมา") + text = text.replace("<doi>เร่งด่วน", "โดยเร่งด่วน") + text = text.replace("ไม่ได้<tamhai>", "ไม่ได้ทำให้") + text = text.replace("จะ<tamhai>", "จะทำให้") + text = text.replace("จน<tamhai>", "จนทำให้") + text = text.replace("เว้น<rth_tea>", "เว้นแต่") + text = text.replace("ก็<tamhai>", "ก็ทำให้") + text = text.replace(" ณ <tonnan>", " ณ ตอนนั้น") + text = text.replace("บาง<rth_suan>", "บางส่วน") + text = text.replace("หรือ<rth_meatea>", "หรือแม้แต่") + text = text.replace("โดย<tamhai>", "โดยทำให้") + text = text.replace("หรือ<rth_orbecause>", "หรือเพราะ") + text = text.replace("มา<rth_tea>", "มาแต่") + text = text.replace("แต่<maitamhai>", "แต่ไม่ทำให้") + text = text.replace("ฉะนั้น<rth_moe>", "ฉะนั้นเมื่อ") + text = text.replace("เพราะ<rth_chanan>", "เพราะฉะนั้น") + text = text.replace("เพราะ<rth_langjak>", "เพราะหลังจาก") + text = text.replace("สามารถ<rth_tamhai>", "สามารถทำให้") + text = text.replace("อาจ<rth_tam>", "อาจทำ") + text = text.replace("จะ<rth_tam>", "จะทำ") + text = text.replace("อีกทั้ง<rth_for>", "อีกทั้งเพื่อ") + text = text.replace("ทั้งนี้<rth_for>", "ทั้งนี้เพื่อ") + text = text.replace("เวลา<rth_toma>", "เวลาต่อมา") + text = 
text.replace( + "อย่างไรก็ตาม<rth_langjak>", + "อย่างไรก็ตามหลังจาก", + ) + text = text.replace("ซึ่ง<rth_tamhai>", "ซึ่งทำให้") + text = text.replace("<doi>ประมาท", "โดยประมาท") + text = text.replace("<doi>ธรรม", "โดยธรรม") + text = text.replace("<doi>สัจจริง", "โดยสัจจริง") + text = text.replace("?", "?<stop>") + text = text.replace("!", "!<stop>") + text = text.replace("<prd>", ".") + sentences = text.split("<stop>") + sentences = [s.strip() for s in sentences] + if "" in sentences: + sentences.remove("") + if "nan" in sentences: + sentences.remove("nan") + + sentences = list(filter(None, sentences)) + + if isMiddleCut: + return middle_cut(sentences) + else: + return sentences
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tools/misspell.html b/5.1/_modules/pythainlp/tools/misspell.html new file mode 100644 index 0000000..cd74040 --- /dev/null +++ b/5.1/_modules/pythainlp/tools/misspell.html @@ -0,0 +1,286 @@ + + + + + + + + pythainlp.tools.misspell — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tools.misspell

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from typing import List
+
+import numpy as np
+
+THAI_CHARACTERS_WITHOUT_SHIFT = [
+    "ผปแอิืทมใฝ",
+    "ฟหกดเ้่าสวง",
+    "ๆไำพะัีรนยบลฃ",
+    "ๅ/_ภถุึคตจขช",
+]
+
+THAI_CHARACTERS_WITH_SHIFT = [
+    "()ฉฮฺ์?ฒฬฦ",
+    "ฤฆฏโฌ็๋ษศซ.",
+    '๐"ฎฑธํ๊ณฯญฐ,',
+    "+๑๒๓๔ู฿๕๖๗๘๙",
+]
+
+ENGLISH_CHARACTERS_WITHOUT_SHIFT = [
+    "1234567890-=",
+    "qwertyuiop[]\\",
+    "asdfghjkl;'",
+    "zxcvbnm,./",
+]
+
+ENGLISH_CHARACTERS_WITH_SHIFT = [
+    "!@#$%^&*()_+",
+    "QWERTYUIOP{}|",
+    'ASDFGHJKL:"',
+    "ZXCVBNM<>?",
+]
+
+
+ALL_CHARACTERS = [
+    THAI_CHARACTERS_WITHOUT_SHIFT + THAI_CHARACTERS_WITH_SHIFT,
+    ENGLISH_CHARACTERS_WITHOUT_SHIFT + ENGLISH_CHARACTERS_WITH_SHIFT,
+]
+
+
+def search_location_of_character(char: str):
+    for language_ix in [0, 1]:
+        for ix, row in enumerate(ALL_CHARACTERS[language_ix]):
+            if char in row:
+                return (language_ix, ix // 4, ix % 4, row.index(char))
+
+
+def find_neighbour_locations(
+    loc: tuple,
+    char: str,
+    kernel: List = [(-1, -1), (-1, 0), (1, 1), (0, 1), (0, -1), (1, 0)],
+):
+    language_ix, is_shift, row, pos = loc
+
+    valid_neighbours = []
+    for kr, ks in kernel:
+        _row, _pos = row + kr, pos + ks
+        if 0 <= _row <= 3 and 0 <= _pos <= len(
+            ALL_CHARACTERS[language_ix][is_shift * 4 + _row]
+        ):
+            valid_neighbours.append((language_ix, is_shift, _row, _pos, char))
+
+    return valid_neighbours
+
+
+def find_misspell_candidates(char: str, verbose: bool = False):
+    loc = search_location_of_character(char)
+    if loc is None:
+        return None
+
+    valid_neighbours = find_neighbour_locations(loc, char)
+
+    chars = []
+    printing_locations = ["▐"] * 3 + [char] + ["▐"] * 3
+
+    for language_ix, is_shift, row, pos, char in valid_neighbours:
+        try:
+            char = ALL_CHARACTERS[language_ix][is_shift * 4 + row][pos]
+            chars.append(char)
+            kernel = (row - loc[1], pos - loc[2])
+
+            if kernel == (-1, -1):
+                ix = 5
+            elif kernel == (-1, 0):
+                ix = 6
+            elif kernel[0] == 0:
+                ix = 3 + kernel[1]
+            elif kernel == (1, 0):
+                ix = 0
+            elif kernel == (1, 1):
+                ix = 1
+            else:
+                continue
+            printing_locations[ix] = char
+        except IndexError:
+            continue
+        except Exception as e:
+            print("Something wrong with: ", char)
+            raise e
+
+    return chars
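The helpers above model the Thai and English keyboard layouts as a grid indexed by (language, shift, row, position); misspelling candidates are simply the physically adjacent keys. A short illustration, worked out by hand from the layout tables above (so the exact values are indicative only)::

    loc = search_location_of_character("า")
    # loc == (0, 0, 1, 7): Thai layout, no shift, row 1, position 7
    candidates = find_misspell_candidates("า")
    # with the tables above this works out to ['ท', 'ม', 'น', 'ส', '่', 'ร']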
+
+
+
+
[docs]
+def misspell(sentence: str, ratio: float = 0.05):
+    """
+    Simulate some misspellings of the input sentence.
+    The number of misspelled locations is governed by ratio.
+
+    :param str sentence: sentence to be misspelled
+    :param float ratio: fraction of characters to misspell. Defaults to 0.05.
+
+    :return: sentence containing some misspelled words
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.tools.misspell import misspell
+
+        sentence = "ภาษาไทยปรากฏครั้งแรกในพุทธศักราช 1826"
+
+        misspell(sentence, ratio=0.1)
+        # output:
+        ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727
+    """
+    num_misspells = np.floor(len(sentence) * ratio).astype(int)
+    positions = np.random.choice(
+        len(sentence), size=num_misspells, replace=False
+    )
+
+    # convert strings to array of characters
+    misspelled = list(sentence)
+    for pos in positions:
+        potential_candidates = find_misspell_candidates(sentence[pos])
+        if potential_candidates is None:
+            continue
+
+        candidate = np.random.choice(potential_candidates)
+
+        misspelled[pos] = candidate
+
+    return "".join(misspelled)
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/tools/path.html b/5.1/_modules/pythainlp/tools/path.html new file mode 100644 index 0000000..bd48319 --- /dev/null +++ b/5.1/_modules/pythainlp/tools/path.html @@ -0,0 +1,225 @@ + + + + + + + + pythainlp.tools.path — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.tools.path

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+PyThaiNLP data tools
+
+For text processing and text conversion, see pythainlp.util
+"""
+import os
+
+from pythainlp import __file__ as pythainlp_file
+
+PYTHAINLP_DEFAULT_DATA_DIR = "pythainlp-data"
+
+
+
+[docs] +def get_full_data_path(path: str) -> str: + """ + This function joins path of :mod:`pythainlp` data directory and the + given path, and returns the full path. + + :return: full path given the name of dataset + :rtype: str + + :Example: + :: + + from pythainlp.tools import get_full_data_path + + get_full_data_path('ttc_freq.txt') + # output: '/root/pythainlp-data/ttc_freq.txt' + """ + return os.path.join(get_pythainlp_data_path(), path)
+ + + +
+[docs] +def get_pythainlp_data_path() -> str: + """ + Returns the full path where PyThaiNLP keeps its (downloaded) data. + If the directory does not yet exist, it will be created. + The path can be specified through the environment variable + :envvar:`PYTHAINLP_DATA_DIR`. By default, `~/pythainlp-data` + will be used. + + :return: full path of directory for :mod:`pythainlp` downloaded data + :rtype: str + + :Example: + :: + + from pythainlp.tools import get_pythainlp_data_path + + get_pythainlp_data_path() + # output: '/root/pythainlp-data' + """ + pythainlp_data_dir = os.getenv( + "PYTHAINLP_DATA_DIR", os.path.join("~", PYTHAINLP_DEFAULT_DATA_DIR) + ) + path = os.path.expanduser(pythainlp_data_dir) + os.makedirs(path, exist_ok=True) + return path
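Since the docstring above mentions the :envvar:`PYTHAINLP_DATA_DIR` environment variable, here is a minimal sketch of relocating the data directory; the path is illustrative only::

    import os

    os.environ["PYTHAINLP_DATA_DIR"] = "/tmp/pythainlp-data"  # hypothetical location

    from pythainlp.tools import get_pythainlp_data_path

    get_pythainlp_data_path()
    # output: '/tmp/pythainlp-data' (created if it does not already exist)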
+ + + +
+[docs] +def get_pythainlp_path() -> str: + """ + This function returns full path of PyThaiNLP codes + + :return: full path of :mod:`pythainlp` codes + :rtype: str + + :Example: + :: + + from pythainlp.tools import get_pythainlp_path + + get_pythainlp_path() + # output: '/usr/local/lib/python3.6/dist-packages/pythainlp' + """ + return os.path.dirname(pythainlp_file)
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/translate/core.html b/5.1/_modules/pythainlp/translate/core.html new file mode 100644 index 0000000..cdb63ac --- /dev/null +++ b/5.1/_modules/pythainlp/translate/core.html @@ -0,0 +1,246 @@ + + + + + + + + pythainlp.translate.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.translate.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+
+[docs] +class Translate: + """ + Machine Translation + """ + +
+
[docs]
+    def __init__(
+        self,
+        src_lang: str,
+        target_lang: str,
+        engine: str = "default",
+        use_gpu: bool = False,
+    ) -> None:
+        """
+        :param str src_lang: source language
+        :param str target_lang: target language
+        :param str engine: machine translation engine
+        :param bool use_gpu: load model using GPU (Default is False)
+
+        **Options for engine**
+            * *default* - The default engine for each language pair.
+            * *small100* - A multilingual machine translation model (covering 100 languages)
+
+        **Options for source & target language**
+            * *th* - *en* - Thai to English
+            * *en* - *th* - English to Thai
+            * *th* - *zh* - Thai to Chinese
+            * *zh* - *th* - Chinese to Thai
+            * *th* - *fr* - Thai to French
+            * *th* - *xx* - Thai to xx (xx is a language code). It uses the small100 model.
+            * *xx* - *th* - xx to Thai (xx is a language code). It uses the small100 model.
+
+        :Example:
+
+        Translate text from Thai to English::
+
+            from pythainlp.translate import Translate
+
+            th2en = Translate("th", "en")
+
+            th2en.translate("ฉันรักแมว")
+            # output: I love cat.
+        """
+        self.model = None
+        self.engine = engine
+        self.src_lang = src_lang
+        self.use_gpu = use_gpu
+        self.target_lang = target_lang
+        self.load_model()
+ + +
+[docs] + def load_model(self): + src_lang = self.src_lang + target_lang = self.target_lang + use_gpu = self.use_gpu + if self.engine == "small100": + from .small100 import Small100Translator + + self.model = Small100Translator(use_gpu) + elif src_lang == "th" and target_lang == "en": + from pythainlp.translate.en_th import ThEnTranslator + + self.model = ThEnTranslator(use_gpu) + elif src_lang == "en" and target_lang == "th": + from pythainlp.translate.en_th import EnThTranslator + + self.model = EnThTranslator(use_gpu) + elif src_lang == "th" and target_lang == "zh": + from pythainlp.translate.zh_th import ThZhTranslator + + self.model = ThZhTranslator(use_gpu) + elif src_lang == "zh" and target_lang == "th": + from pythainlp.translate.zh_th import ZhThTranslator + + self.model = ZhThTranslator(use_gpu) + elif src_lang == "th" and target_lang == "fr": + from pythainlp.translate.th_fr import ThFrTranslator + + self.model = ThFrTranslator(use_gpu) + else: + raise ValueError("Not support language!")
+ + +
+[docs] + def translate(self, text) -> str: + """ + Translate text + + :param str text: input text in source language + :return: translated text in target language + :rtype: str + """ + if self.engine == "small100": + return self.model.translate(text, tgt_lang=self.target_lang) + return self.model.translate(text)
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/translate/en_th.html b/5.1/_modules/pythainlp/translate/en_th.html new file mode 100644 index 0000000..8970046 --- /dev/null +++ b/5.1/_modules/pythainlp/translate/en_th.html @@ -0,0 +1,320 @@ + + + + + + + + pythainlp.translate.en_th — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.translate.en_th

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+English-Thai Machine Translation
+
+from VISTEC-depa Thailand Artificial Intelligence Research Institute
+
+Website: https://airesearch.in.th/releases/machine-translation-models/
+"""
+import os
+
+from fairseq.models.transformer import TransformerModel
+from sacremoses import MosesTokenizer
+
+from pythainlp.corpus import download, get_corpus_path
+
+_EN_TH_MODEL_NAME = "scb_1m_en-th_moses"
+# SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0.tar.gz
+_EN_TH_FILE_NAME = "SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0"
+
+_TH_EN_MODEL_NAME = "scb_1m_th-en_spm"
+# SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz
+_TH_EN_FILE_NAME = "SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0"
+
+
+def _get_translate_path(model: str, *path: str) -> str:
+    return os.path.join(get_corpus_path(model, version="1.0"), *path)
+
+
+def _download_install(name: str) -> None:
+    if get_corpus_path(name) is None:
+        download(name, force=True, version="1.0")
+
+
+
+[docs] +def download_model_all() -> None: + """ + Download all translation models in advance + """ + _download_install(_EN_TH_MODEL_NAME) + _download_install(_TH_EN_MODEL_NAME)
+ + + +
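A minimal usage sketch for the helper above, assuming an environment with network access: fetch both SCB-1M models once so that later constructions of the translators do not trigger a download::

    from pythainlp.translate.en_th import EnThTranslator, download_model_all

    download_model_all()     # downloads scb_1m_en-th_moses and scb_1m_th-en_spm if missing
    enth = EnThTranslator()  # now loads the model from the local corpus path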
+[docs] +class EnThTranslator: + """ + English-Thai Machine Translation + + from VISTEC-depa Thailand Artificial Intelligence Research Institute + + Website: https://airesearch.in.th/releases/machine-translation-models/ + + :param bool use_gpu : load model using GPU (Default is False) + """ + +
+[docs] + def __init__(self, use_gpu: bool = False): + self._tokenizer = MosesTokenizer("en") + + self._model_name = _EN_TH_MODEL_NAME + + _download_install(self._model_name) + self._model = TransformerModel.from_pretrained( + model_name_or_path=_get_translate_path( + self._model_name, + _EN_TH_FILE_NAME, + "models", + ), + checkpoint_file="checkpoint.pt", + data_name_or_path=_get_translate_path( + self._model_name, + _EN_TH_FILE_NAME, + "vocab", + ), + ) + if use_gpu: + self._model = self._model.cuda()
+ + +
+[docs] + def translate(self, text: str) -> str: + """ + Translate text from English to Thai + + :param str text: input text in source language + :return: translated text in target language + :rtype: str + + :Example: + + Translate text from English to Thai:: + + from pythainlp.translate import EnThTranslator + + enth = EnThTranslator() + + enth.translate("I love cat.") + # output: ฉันรักแมว + + """ + tokens = " ".join(self._tokenizer.tokenize(text)) + translated = self._model.translate(tokens) + return translated.replace(" ", "").replace("▁", " ").strip()
+
+ + + +
+[docs] +class ThEnTranslator: + """ + Thai-English Machine Translation + + from VISTEC-depa Thailand Artificial Intelligence Research Institute + + Website: https://airesearch.in.th/releases/machine-translation-models/ + + :param bool use_gpu : load model using GPU (Default is False) + """ + +
+[docs] + def __init__(self, use_gpu: bool = False): + self._model_name = _TH_EN_MODEL_NAME + + _download_install(self._model_name) + self._model = TransformerModel.from_pretrained( + model_name_or_path=_get_translate_path( + self._model_name, + _TH_EN_FILE_NAME, + "models", + ), + checkpoint_file="checkpoint.pt", + data_name_or_path=_get_translate_path( + self._model_name, + _TH_EN_FILE_NAME, + "vocab", + ), + bpe="sentencepiece", + sentencepiece_model=_get_translate_path( + self._model_name, + _TH_EN_FILE_NAME, + "bpe", + "spm.th.model", + ), + ) + if use_gpu: + self._model.cuda()
+ + +
+[docs] + def translate(self, text: str) -> str: + """ + Translate text from Thai to English + + :param str text: input text in source language + :return: translated text in target language + :rtype: str + + :Example: + + Translate text from Thai to English:: + + from pythainlp.translate import ThEnTranslator + + then = ThEnTranslator() + + then.translate("ฉันรักแมว") + # output: I love cat. + + """ + return self._model.translate(text)
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/translate/th_fr.html b/5.1/_modules/pythainlp/translate/th_fr.html new file mode 100644 index 0000000..a641fcb --- /dev/null +++ b/5.1/_modules/pythainlp/translate/th_fr.html @@ -0,0 +1,217 @@ + + + + + + + + pythainlp.translate.th_fr — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.translate.th_fr

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai-French Machine Translation
+
+Trained by OPUS Corpus
+
+Model is from Language Technology Research Group at the University of Helsinki
+
+BLEU 20.4
+
+- Huggingface https://huggingface.co/Helsinki-NLP/opus-mt-th-fr
+"""
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+
+
+[docs] +class ThFrTranslator: + """ + Thai-French Machine Translation + + Trained by OPUS Corpus + + Model is from Language Technology Research Group at the University of Helsinki + + BLEU 20.4 + + - Huggingface https://huggingface.co/Helsinki-NLP/opus-mt-th-fr + + :param bool use_gpu : load model using GPU (Default is False) + """ + +
+[docs] + def __init__( + self, + use_gpu: bool = False, + pretrained: str = "Helsinki-NLP/opus-mt-th-fr", + ) -> None: + self.tokenizer_thzh = AutoTokenizer.from_pretrained(pretrained) + self.model_thzh = AutoModelForSeq2SeqLM.from_pretrained(pretrained) + if use_gpu: + self.model_thzh = self.model_thzh.cuda()
+ + +
+[docs] + def translate(self, text: str) -> str: + """ + Translate text from Thai to French + + :param str text: input text in source language + :return: translated text in target language + :rtype: str + + :Example: + + Translate text from Thai to French:: + + from pythainlp.translate.th_fr import ThFrTranslator + + thfr = ThFrTranslator() + + thfr.translate("ทดสอบระบบ") + # output: "Test du système." + + """ + self.translated = self.model_thzh.generate( + **self.tokenizer_thzh(text, return_tensors="pt", padding=True) + ) + return [ + self.tokenizer_thzh.decode(t, skip_special_tokens=True) + for t in self.translated + ][0]
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/translate/zh_th.html b/5.1/_modules/pythainlp/translate/zh_th.html new file mode 100644 index 0000000..00e5931 --- /dev/null +++ b/5.1/_modules/pythainlp/translate/zh_th.html @@ -0,0 +1,271 @@ + + + + + + + + pythainlp.translate.zh_th — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.translate.zh_th

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Lalita Chinese-Thai Machine Translation
+
+from AI builder
+
+- GitHub: https://github.com/LalitaDeelert/lalita-mt-zhth
+- Facebook post https://web.facebook.com/aibuildersx/posts/166736255494822
+"""
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+
+
+[docs] +class ThZhTranslator: + """ + Thai-Chinese Machine Translation + + from Lalita @ AI builder + + - GitHub: https://github.com/LalitaDeelert/lalita-mt-zhth + - Facebook post https://web.facebook.com/aibuildersx/posts/166736255494822 + + :param bool use_gpu : load model using GPU (Default is False) + """ + +
+[docs] + def __init__( + self, + use_gpu: bool = False, + pretrained: str = "Lalita/marianmt-th-zh_cn", + ) -> None: + self.tokenizer_thzh = AutoTokenizer.from_pretrained(pretrained) + self.model_thzh = AutoModelForSeq2SeqLM.from_pretrained(pretrained) + if use_gpu: + self.model_thzh = self.model_thzh.cuda()
+ + +
+[docs] + def translate(self, text: str) -> str: + """ + Translate text from Thai to Chinese + + :param str text: input text in source language + :return: translated text in target language + :rtype: str + + :Example: + + Translate text from Thai to Chinese:: + + from pythainlp.translate import ThZhTranslator + + thzh = ThZhTranslator() + + thzh.translate("ผมรักคุณ") + # output: 我爱你 + + """ + self.translated = self.model_thzh.generate( + **self.tokenizer_thzh(text, return_tensors="pt", padding=True) + ) + return [ + self.tokenizer_thzh.decode(t, skip_special_tokens=True) + for t in self.translated + ][0]
+
+ + + +
+[docs] +class ZhThTranslator: + """ + Chinese-Thai Machine Translation + + from Lalita @ AI builder + + - GitHub: https://github.com/LalitaDeelert/lalita-mt-zhth + - Facebook post https://web.facebook.com/aibuildersx/posts/166736255494822 + + :param bool use_gpu : load model using GPU (Default is False) + """ + +
+[docs] + def __init__( + self, + use_gpu: bool = False, + pretrained: str = "Lalita/marianmt-zh_cn-th", + ) -> None: + self.tokenizer_zhth = AutoTokenizer.from_pretrained(pretrained) + self.model_zhth = AutoModelForSeq2SeqLM.from_pretrained(pretrained) + if use_gpu: + self.model_zhth.cuda()
+ + +
+[docs] + def translate(self, text: str) -> str: + """ + Translate text from Chinese to Thai + + :param str text: input text in source language + :return: translated text in target language + :rtype: str + + :Example: + + Translate text from Chinese to Thai:: + + from pythainlp.translate import ZhThTranslator + + zhth = ZhThTranslator() + + zhth.translate("我爱你") + # output: ผมรักคุณนะ + + """ + self.translated = self.model_zhth.generate( + **self.tokenizer_zhth(text, return_tensors="pt", padding=True) + ) + return [ + self.tokenizer_zhth.decode(t, skip_special_tokens=True) + for t in self.translated + ][0]
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/transliterate/core.html b/5.1/_modules/pythainlp/transliterate/core.html new file mode 100644 index 0000000..339deed --- /dev/null +++ b/5.1/_modules/pythainlp/transliterate/core.html @@ -0,0 +1,356 @@ + + + + + + + + pythainlp.transliterate.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.transliterate.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+DEFAULT_ROMANIZE_ENGINE = "royin"
+DEFAULT_TRANSLITERATE_ENGINE = "thaig2p"
+DEFAULT_PRONUNCIATE_ENGINE = "w2p"
+
+
+
+[docs] +def romanize( + text: str, + engine: str = DEFAULT_ROMANIZE_ENGINE, + fallback_engine: str = DEFAULT_ROMANIZE_ENGINE, +) -> str: + """ + This function renders Thai words in the Latin alphabet or "romanization", + using the Royal Thai General System of Transcription (RTGS) + [#rtgs_transcription]_. RTGS is the official system published + by the Royal Institute of Thailand. (Thai: ถอดเสียงภาษาไทยเป็นอักษรละติน) + + :param str text: Thai text to be romanized + :param str engine: One of 'royin' (default), 'thai2rom', 'thai2rom_onnx, 'tltk', and 'lookup'. See more in options for engine section. + :param str fallback_engine: If engine equals 'lookup', use `fallback_engine` for words that are not in the transliteration dict. + No effect on other engines. Default to 'royin'. + + :return: A string of Thai words rendered in the Latin alphabet. + :rtype: str + + :Options for engines: + * *royin* - (default) based on the Royal Thai General System of + Transcription issued by Royal Institute of Thailand. + * *thai2rom* - a deep learning-based Thai romanization engine + (require PyTorch). + * *thai2rom_onnx* - a deep learning-based Thai romanization engine with ONNX runtime + * *tltk* - TLTK: Thai Language Toolkit + * *lookup* - Look up on Thai-English Transliteration dictionary v1.4 compiled by Wannaphong. + + :Example: + :: + + from pythainlp.transliterate import romanize + + romanize("สามารถ", engine="royin") + # output: 'samant' + + romanize("สามารถ", engine="thai2rom") + # output: 'samat' + + romanize("สามารถ", engine="tltk") + # output: 'samat' + + romanize("ภาพยนตร์", engine="royin") + # output: 'phapn' + + romanize("ภาพยนตร์", engine="thai2rom") + # output: 'phapphayon' + + romanize("ภาพยนตร์", engine="thai2rom_onnx") + # output: 'phapphayon' + + romanize("ก็อปปี้", engine="lookup") + # output: 'copy' + + """ + + def select_romanize_engine(engine: str): + if engine == "thai2rom": + from pythainlp.transliterate.thai2rom import romanize + elif engine == "thai2rom_onnx": + from pythainlp.transliterate.thai2rom_onnx import romanize + elif engine == "tltk": + from pythainlp.transliterate.tltk import romanize + else: # use default engine "royin" + from pythainlp.transliterate.royin import romanize + + return romanize + + if not text or not isinstance(text, str): + return "" + + if engine == "lookup": + from pythainlp.transliterate.lookup import romanize + + fallback = select_romanize_engine(fallback_engine) + return romanize(text, fallback_func=fallback) + else: + rom_engine = select_romanize_engine(engine) + trans_word = [] + for word in text.split(' '): + trans_word.append(rom_engine(word)) + new_word = ''.join(trans_word) + return new_word
+ + + +
+[docs] +def transliterate( + text: str, engine: str = DEFAULT_TRANSLITERATE_ENGINE +) -> str: + """ + This function transliterates Thai text. + + :param str text: Thai text to be transliterated + :param str engine: 'icu', 'ipa', or 'thaig2p' (default) + + :return: A string of phonetic alphabets indicating + how the input text should be pronounced. + :rtype: str + + :Options for engines: + * *thaig2p* - (default) Thai Grapheme-to-Phoneme, + output is IPA (require PyTorch) + * *icu* - pyicu, based on International Components for Unicode (ICU) + * *ipa* - epitran, output is International Phonetic Alphabet (IPA) + * *tltk_g2p* - Thai Grapheme-to-Phoneme from\ + `TLTK <https://pypi.org/project/tltk/>`_., + * *iso_11940* - Thai text into Latin characters with ISO 11940. + * *tltk_ipa* - tltk, output is International Phonetic Alphabet (IPA) + * *thaig2p_v2* - Thai Grapheme-to-Phoneme, + output is IPA. https://huggingface.co/pythainlp/thaig2p-v2.0 + + :Example: + :: + + from pythainlp.transliterate import transliterate + + transliterate("สามารถ", engine="icu") + # output: 's̄āmārt̄h' + + transliterate("สามารถ", engine="ipa") + # output: 'saːmaːrot' + + transliterate("สามารถ", engine="thaig2p") + # output: 's aː ˩˩˦ . m aː t̚ ˥˩' + + transliterate("สามารถ", engine="tltk_ipa") + # output: 'saː5.maːt3' + + transliterate("สามารถ", engine="tltk_g2p") + # output: 'saa4~maat2' + + transliterate("สามารถ", engine="iso_11940") + # output: 's̄āmārt̄h' + + transliterate("ภาพยนตร์", engine="icu") + # output: 'p̣hāphyntr̒' + + transliterate("ภาพยนตร์", engine="ipa") + # output: 'pʰaːpjanot' + + transliterate("ภาพยนตร์", engine="thaig2p") + # output: 'pʰ aː p̚ ˥˩ . pʰ a ˦˥ . j o n ˧' + + transliterate("ภาพยนตร์", engine="iso_11940") + # output: 'p̣hāphyntr' + """ + + if not text or not isinstance(text, str): + return "" + + if engine in ("icu", "pyicu"): + from pythainlp.transliterate.pyicu import transliterate + elif engine == "ipa": + from pythainlp.transliterate.ipa import transliterate + elif engine == "tltk_g2p": + from pythainlp.transliterate.tltk import tltk_g2p as transliterate + elif engine == "tltk_ipa": + from pythainlp.transliterate.tltk import tltk_ipa as transliterate + elif engine == "iso_11940": + from pythainlp.transliterate.iso_11940 import transliterate + elif engine == "thaig2p_v2": + from pythainlp.transliterate.thaig2p_v2 import transliterate + else: # use default engine: "thaig2p" + from pythainlp.transliterate.thaig2p import transliterate + + return transliterate(text)
+ + + +
+[docs] +def pronunciate(word: str, engine: str = DEFAULT_PRONUNCIATE_ENGINE) -> str: + """ + This function pronunciates Thai word. + + :param str word: Thai text to be pronunciated + :param str engine: 'w2p' (default) + + :return: A string of Thai letters indicating + how the input text should be pronounced. + :rtype: str + + :Options for engines: + * *w2p* - Thai Word-to-Phoneme + + :Example: + :: + + from pythainlp.transliterate import pronunciate + + pronunciate("สามารถ", engine="w2p") + # output: 'สา-มาด' + + pronunciate("ภาพยนตร์", engine="w2p") + # output: 'พาบ-พะ-ยน' + """ + if not word or not isinstance(word, str): + return "" + + # if engine == "w2p": # has only one engine + from pythainlp.transliterate.w2p import pronunciate + + return pronunciate(word)
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/transliterate/royin.html b/5.1/_modules/pythainlp/transliterate/royin.html new file mode 100644 index 0000000..30d37b5 --- /dev/null +++ b/5.1/_modules/pythainlp/transliterate/royin.html @@ -0,0 +1,362 @@ + + + + + + + + pythainlp.transliterate.royin — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
+
+
+
+
+ +

Source code for pythainlp.transliterate.royin

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+The Royal Thai General System of Transcription (RTGS)
+is the official system for rendering Thai words in the Latin alphabet.
+It was published by the Royal Institute of Thailand.
+
+:See Also:
+    * `Wikipedia <https://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription>`_
+"""
+import re
+
+from pythainlp import thai_consonants, word_tokenize
+
+# vowel
+_vowel_patterns = """เ*ียว,\\1iao
+แ*็ว,\\1aeo
+เ*ือย,\\1ueai
+แ*ว,\\1aeo
+เ*็ว,\\1eo
+เ*ว,\\1eo
+*ิว,\\1io
+*วย,\\1uai
+เ*ย,\\1oei
+*อย,\\1oi
+โ*ย,\\1oi
+*ุย,\\1ui
+*าย,\\1ai
+ไ*ย,\\1ai
+*ัย,\\1ai
+ไ**,\\1\\2ai
+ไ*,\\1ai
+ใ*,\\1ai
+*ว*,\\1ua\\2
+*ัวะ,\\1ua
+*ัว,\\1ua
+เ*ือะ,\\1uea
+เ*ือ,\\1uea
+เ*ียะ,\\1ia
+เ*ีย,\\1ia
+เ*อะ,\\1oe
+เ*อ,\\1oe
+เ*ิ,\\1oe
+*อ,\\1o
+เ*าะ,\\1o
+เ*็,\\1e
+โ*ะ,\\1o
+โ*,\\1o
+แ*ะ,\\1ae
+แ*,\\1ae
+เ*าะ,\\1e
+*าว,\\1ao
+เ*า,\\1ao
+เ*,\\1e
+*ู,\\1u
+*ุ,\\1u
+*ื,\\1ue
+*ึ,\\1ue
+*ี,\\1i
+*ิ,\\1i
+*ำ,\\1am
+*า,\\1a
+*ั,\\1a
+*ะ,\\1a
+#ฤ,\\1rue
+$ฤ,\\1ri"""
+_vowel_patterns = _vowel_patterns.replace("*", f"([{thai_consonants}])")
+_vowel_patterns = _vowel_patterns.replace("#", "([คนพมห])")
+_vowel_patterns = _vowel_patterns.replace("$", "([กตทปศส])")
+
+_VOWELS = [x.split(",") for x in _vowel_patterns.split("\n")]
+
+# Consonants: initial romanization, final romanization (พยัญชนะต้น, ตัวสะกด)
+_CONSONANTS = {
+    "ก": ["k", "k"],
+    "ข": ["kh", "k"],
+    "ฃ": ["kh", "k"],
+    "ค": ["kh", "k"],
+    "ฅ": ["kh", "k"],
+    "ฆ": ["kh", "k"],
+    "ง": ["ng", "ng"],
+    "จ": ["ch", "t"],
+    "ฉ": ["ch", "t"],
+    "ช": ["ch", "t"],
+    "ซ": ["s", "t"],
+    "ฌ": ["ch", "t"],
+    "ญ": ["y", "n"],
+    "ฎ": ["d", "t"],
+    "ฏ": ["t", "t"],
+    "ฐ": ["th", "t"],
+    # Note: ฑ as an initial consonant can also be romanized as d
+    "ฑ": ["th", "t"],
+    "ฒ": ["th", "t"],
+    "ณ": ["n", "n"],
+    "ด": ["d", "t"],
+    "ต": ["t", "t"],
+    "ถ": ["th", "t"],
+    "ท": ["th", "t"],
+    "ธ": ["th", "t"],
+    "น": ["n", "n"],
+    "บ": ["b", "p"],
+    "ป": ["p", "p"],
+    "ผ": ["ph", "p"],
+    "ฝ": ["f", "p"],
+    "พ": ["ph", "p"],
+    "ฟ": ["f", "p"],
+    "ภ": ["ph", "p"],
+    "ม": ["m", "m"],
+    "ย": ["y", ""],
+    "ร": ["r", "n"],
+    "ฤ": ["rue", ""],
+    "ล": ["l", "n"],
+    "ว": ["w", ""],
+    "ศ": ["s", "t"],
+    "ษ": ["s", "t"],
+    "ส": ["s", "t"],
+    "ห": ["h", ""],
+    "ฬ": ["l", "n"],
+    "อ": ["", ""],
+    "ฮ": ["h", ""],
+}
+
+_THANTHAKHAT = "\u0e4c"
+_RE_CONSONANT = re.compile(f"[{thai_consonants}]")
+_RE_NORMALIZE = re.compile(
+    f"จน์|มณ์|ณฑ์|ทร์|ตร์|[{thai_consonants}]{_THANTHAKHAT}|"
+    f"[{thai_consonants}][\u0e30-\u0e39]{_THANTHAKHAT}"
+    # Paiyannoi, Maiyamok, Tonemarks, Thanthakhat, Nikhahit, other signs
+    r"|[\u0e2f\u0e46\u0e48-\u0e4f\u0e5a\u0e5b]"
+)
+
+
+def _normalize(word: str) -> str:
+    """
+    Remove characters that are not pronounced, as well as tone marks.
+
+    That is, drop karan (thanthakhat), paiyannoi, maiyamok, and tone marks.
+    """
+    return _RE_NORMALIZE.sub("", word)
+
+
+def _replace_vowels(word: str) -> str:
+    for vowel in _VOWELS:
+        word = re.sub(vowel[0], vowel[1], word)
+
+    return word
+
+
+def _replace_consonants(word: str, consonants: str) -> str:
+    _HO_HIP = "\u0e2b"  # ห
+    _RO_RUA = "\u0e23"  # ร
+    _DOUBLE_RO_RUA = _RO_RUA + _RO_RUA
+
+    if not consonants:
+        return word
+
+    skip = False
+    mod_chars = []
+    j = 0  # j is the index of consonants
+    for i in range(len(word)):
+        if skip:
+            skip = False
+            j += 1
+        elif word[i] not in _CONSONANTS:  # word[i] is not a Thai consonant.
+            mod_chars.append(word[i])
+        elif (
+            len(mod_chars) == 0 and word[i] == _HO_HIP and len(consonants) != 1
+        ):  # Skip HO HIP except that HO HIP is the only one consonant
+            j += 1
+        elif (
+            len(mod_chars) == 0
+        ):  # The first character must be an initial consonant.
+            mod_chars.append(_CONSONANTS[consonants[j]][0])
+            j += 1
+        elif word[i:] == _DOUBLE_RO_RUA:  # Double RO RUA is in end of word
+            skip = True
+            mod_chars.append("a")
+            mod_chars.append("n")
+            j += 1
+        elif word[i : i + 2] == _DOUBLE_RO_RUA:
+            skip = True
+            mod_chars.append("a")
+            j += 1
+        else:  # Assume that the rest are final consonants.
+            mod_chars.append(_CONSONANTS[consonants[j]][1])
+            j += 1
+    return "".join(mod_chars)
+
+
+# support function for romanize()
+def _romanize(word: str) -> str:
+    word = _replace_vowels(_normalize(word))
+    consonants = _RE_CONSONANT.findall(word)
+
+    # 2-character word, all consonants
+    if len(word) == 2 and len(consonants) == 2:
+        word = list(word)
+        word.insert(1, "o")
+        word = "".join(word)
+
+    word = _replace_consonants(word, consonants)
+    return word
+
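A short worked trace of the helpers above, derived from the vowel and consonant tables in this module (shown for illustration only)::

    _romanize("มา")  # _replace_vowels: "มา" -> "มa"; "ม" maps to initial "m" -> 'ma'
    _romanize("คน")  # no written vowel, so "o" is inserted: "คน" -> "คoน" -> 'khon'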
+
+
+[docs] +def romanize(text: str) -> str: + """Render Thai words in Latin alphabet, using RTGS + + Royal Thai General System of Transcription (RTGS), + is the official system by the Royal Institute of Thailand. + + :param text: Thai text to be romanized + :type text: str + :return: A string of Thai words rendered in the Latin alphabet + :rtype: str + """ + words = word_tokenize(text) + romanized_words = [_romanize(word) for word in words] + + return "".join(romanized_words)
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/transliterate/spoonerism.html b/5.1/_modules/pythainlp/transliterate/spoonerism.html new file mode 100644 index 0000000..5fd8271 --- /dev/null +++ b/5.1/_modules/pythainlp/transliterate/spoonerism.html @@ -0,0 +1,220 @@ + + + + + + + + pythainlp.transliterate.spoonerism — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
+
+
+
+
+ +

Source code for pythainlp.transliterate.spoonerism

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from pythainlp import thai_consonants
+from pythainlp.transliterate import pronunciate
+
+_list_consonants = list(thai_consonants.replace("ห", ""))
+
+
+
+[docs] +def puan(word: str, show_pronunciation: bool = True) -> str: + """ + Thai Spoonerism + + This function converts Thai word to spoonerism word. + + :param str word: Thai word to be spoonerized + :param bool show_pronunciation: True (default) or False + + :return: A string of Thai spoonerism word. + :rtype: str + + :Example: + :: + + from pythainlp.transliterate import puan + + puan("นาริน") + # output: 'นิน-รา' + + puan("นาริน", False) + # output: 'นินรา' + """ + word = pronunciate(word, engine="w2p") + _list_char = [] + _list_pron = word.split("-") + _mix_list = "" + if len(_list_pron) == 1: + return word + if show_pronunciation: + _mix_list = "-" + for i in _list_pron: + for j in i: + if j in _list_consonants: + _list_char.append(j) + break + elif "ห" == j and "หฺ" not in i and len(i) == 2: + _list_char.append(j) + break + + list_w_char = list(zip(_list_pron, _list_char)) + _list_w = [] + if len(list_w_char) == 2: + _list_w.append( + list_w_char[1][0].replace(list_w_char[1][1], list_w_char[0][1], 1) + ) + _list_w.append( + list_w_char[0][0].replace(list_w_char[0][1], list_w_char[1][1], 1) + ) + elif len(list_w_char) == 3: + _list_w.append(_list_pron[0]) + _list_w.append( + list_w_char[2][0].replace(list_w_char[2][1], list_w_char[1][1], 1) + ) + _list_w.append( + list_w_char[1][0].replace(list_w_char[1][1], list_w_char[2][1], 1) + ) + else: # > 3 syllables + _list_w.append( + _list_pron[0].replace(list_w_char[0][1], list_w_char[-1][1], 1) + ) + for i in range(1, len(list_w_char) - 1): + _list_w.append(_list_pron[i]) + _list_w.append( + _list_pron[-1].replace(list_w_char[-1][1], list_w_char[0][1], 1) + ) + if not show_pronunciation: + _list_w = [i.replace("หฺ", "").replace("ฺ", "") for i in _list_w] + return _mix_list.join(_list_w)
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/transliterate/thai2rom.html b/5.1/_modules/pythainlp/transliterate/thai2rom.html new file mode 100644 index 0000000..1e6d74d --- /dev/null +++ b/5.1/_modules/pythainlp/transliterate/thai2rom.html @@ -0,0 +1,505 @@ + + + + + + + + pythainlp.transliterate.thai2rom — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
+
+
+
+
+ +

Source code for pythainlp.transliterate.thai2rom

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Romanization of Thai words based on machine-learnt engine ("thai2rom")
+"""
+
+import random
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from pythainlp.corpus import get_corpus_path
+
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+_MODEL_NAME = "thai2rom-pytorch-attn"
+
+
+class ThaiTransliterator:
+    def __init__(self):
+        """
+        Transliteration of Thai words.
+
+        Now supports Thai to Latin (romanization)
+        """
+        # get the model, download it if it's not available locally
+        self.__model_filename = get_corpus_path(_MODEL_NAME)
+
+        loader = torch.load(self.__model_filename, map_location=device)
+
+        INPUT_DIM, E_EMB_DIM, E_HID_DIM, E_DROPOUT = loader["encoder_params"]
+        OUTPUT_DIM, D_EMB_DIM, D_HID_DIM, D_DROPOUT = loader["decoder_params"]
+
+        self._maxlength = 100
+
+        self._char_to_ix = loader["char_to_ix"]
+        self._ix_to_char = loader["ix_to_char"]
+        self._target_char_to_ix = loader["target_char_to_ix"]
+        self._ix_to_target_char = loader["ix_to_target_char"]
+
+        # encoder/ decoder
+        # Restore the model and construct the encoder and decoder.
+        self._encoder = Encoder(INPUT_DIM, E_EMB_DIM, E_HID_DIM, E_DROPOUT)
+
+        self._decoder = AttentionDecoder(
+            OUTPUT_DIM, D_EMB_DIM, D_HID_DIM, D_DROPOUT
+        )
+
+        self._network = Seq2Seq(
+            self._encoder,
+            self._decoder,
+            self._target_char_to_ix["<start>"],
+            self._target_char_to_ix["<end>"],
+            self._maxlength,
+        ).to(device)
+
+        self._network.load_state_dict(loader["model_state_dict"])
+        self._network.eval()
+
+    def _prepare_sequence_in(self, text: str):
+        """
+        Prepare input sequence for PyTorch
+        """
+        idxs = []
+        for ch in text:
+            if ch in self._char_to_ix:
+                idxs.append(self._char_to_ix[ch])
+            else:
+                idxs.append(self._char_to_ix["<UNK>"])
+        idxs.append(self._char_to_ix["<end>"])
+        tensor = torch.tensor(idxs, dtype=torch.long)
+        return tensor.to(device)
+
+    def romanize(self, text: str) -> str:
+        """
+        :param str text: Thai text to be romanized
+        :return: English (more or less) text that spells out how the Thai text
+                 should be pronounced.
+        """
+        input_tensor = self._prepare_sequence_in(text).view(1, -1)
+        input_length = torch.Tensor([len(text) + 1]).int()
+        target_tensor_logits = self._network(
+            input_tensor, input_length, None, 0
+        )
+
+        # Seq2seq model returns <END> as the first token,
+        # As a result, target_tensor_logits.size() is torch.Size([0])
+        if target_tensor_logits.size(0) == 0:
+            target = ["<PAD>"]
+        else:
+            target_tensor = (
+                torch.argmax(target_tensor_logits.squeeze(1), 1)
+                .cpu()
+                .detach()
+                .numpy()
+            )
+            target = [self._ix_to_target_char[t] for t in target_tensor]
+
+        return "".join(target)
+
+
+class Encoder(nn.Module):
+    def __init__(
+        self, vocabulary_size, embedding_size, hidden_size, dropout=0.5
+    ):
+        """Constructor"""
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.character_embedding = nn.Embedding(
+            vocabulary_size, embedding_size
+        )
+        self.rnn = nn.LSTM(
+            input_size=embedding_size,
+            hidden_size=hidden_size // 2,
+            bidirectional=True,
+            batch_first=True,
+        )
+
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, sequences, sequences_lengths):
+        # sequences: (batch_size, sequence_length=MAX_LENGTH)
+        # sequences_lengths: (batch_size)
+
+        batch_size = sequences.size(0)
+        hidden = self.init_hidden(batch_size)
+
+        sequences_lengths = torch.flip(
+            torch.sort(sequences_lengths).values, dims=(0,)
+        )
+        index_sorted = torch.sort(-1 * sequences_lengths).indices
+        index_unsort = torch.sort(index_sorted).indices  # to unsorted sequence
+        sequences = sequences.index_select(0, index_sorted.to(device))
+
+        sequences = self.character_embedding(sequences)
+        sequences = self.dropout(sequences)
+
+        sequences_packed = nn.utils.rnn.pack_padded_sequence(
+            sequences, sequences_lengths.clone(), batch_first=True
+        )
+
+        sequences_output, hidden = self.rnn(sequences_packed, hidden)
+
+        sequences_output, _ = nn.utils.rnn.pad_packed_sequence(
+            sequences_output, batch_first=True
+        )
+
+        sequences_output = sequences_output.index_select(
+            0, index_unsort.clone().detach()
+        )
+        return sequences_output, hidden
+
+    def init_hidden(self, batch_size):
+        h_0 = torch.zeros(
+            [2, batch_size, self.hidden_size // 2], requires_grad=True
+        ).to(device)
+        c_0 = torch.zeros(
+            [2, batch_size, self.hidden_size // 2], requires_grad=True
+        ).to(device)
+
+        return (h_0, c_0)
+
+
+class Attn(nn.Module):
+    def __init__(self, method, hidden_size):
+        super().__init__()
+
+        self.method = method
+        self.hidden_size = hidden_size
+
+        if self.method == "general":
+            self.attn = nn.Linear(self.hidden_size, hidden_size)
+
+        elif self.method == "concat":
+            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
+            self.other = nn.Parameter(torch.FloatTensor(1, hidden_size))
+
+    def forward(self, hidden, encoder_outputs, mask):
+        # Calculate energies for each encoder output
+        if self.method == "dot":
+            attn_energies = torch.bmm(
+                encoder_outputs, hidden.transpose(1, 2)
+            ).squeeze(2)
+        elif self.method == "general":
+            attn_energies = self.attn(
+                encoder_outputs.view(-1, encoder_outputs.size(-1))
+            )  # (batch_size * sequence_len,  hidden_size)
+            attn_energies = torch.bmm(
+                attn_energies.view(*encoder_outputs.size()),
+                hidden.transpose(1, 2),
+            ).squeeze(2)  # (batch_size,  sequence_len)
+        elif self.method == "concat":
+            attn_energies = self.attn(
+                torch.cat(
+                    (hidden.expand(*encoder_outputs.size()), encoder_outputs),
+                    2,
+                )
+            )  # (batch_size, sequence_len,  hidden_size)
+            attn_energies = torch.bmm(
+                attn_energies,
+                self.other.unsqueeze(0).expand(*hidden.size()).transpose(1, 2),
+            ).squeeze(2)
+
+        attn_energies = attn_energies.masked_fill(mask == 0, -1e10)
+
+        # Normalize energies to weights in range 0 to 1
+        return F.softmax(attn_energies, 1)
+
+
+class AttentionDecoder(nn.Module):
+    def __init__(
+        self, vocabulary_size, embedding_size, hidden_size, dropout=0.5
+    ):
+        """Constructor"""
+        super().__init__()
+        self.vocabulary_size = vocabulary_size
+        self.hidden_size = hidden_size
+        self.character_embedding = nn.Embedding(
+            vocabulary_size, embedding_size
+        )
+        self.rnn = nn.LSTM(
+            input_size=embedding_size + self.hidden_size,
+            hidden_size=hidden_size,
+            bidirectional=False,
+            batch_first=True,
+        )
+
+        self.attn = Attn(method="general", hidden_size=self.hidden_size)
+        self.linear = nn.Linear(hidden_size, vocabulary_size)
+
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, input_character, last_hidden, encoder_outputs, mask):
+        """Defines the forward computation of the decoder"""
+
+        # input_character: (batch_size, 1)
+        # last_hidden: (batch_size, hidden_dim)
+        # encoder_outputs: (batch_size, sequence_len, hidden_dim)
+        # mask: (batch_size, sequence_len)
+
+        hidden = last_hidden.permute(1, 0, 2)
+        attn_weights = self.attn(hidden, encoder_outputs, mask)
+
+        context_vector = attn_weights.unsqueeze(1).bmm(encoder_outputs)
+        context_vector = torch.sum(context_vector, dim=1)
+        context_vector = context_vector.unsqueeze(1)
+
+        embedded = self.character_embedding(input_character)
+        embedded = self.dropout(embedded)
+
+        rnn_input = torch.cat((context_vector, embedded), -1)
+
+        output, hidden = self.rnn(rnn_input)
+        output = output.view(-1, output.size(2))
+
+        x = self.linear(output)
+
+        return x, hidden[0], attn_weights
+
+
+class Seq2Seq(nn.Module):
+    def __init__(
+        self,
+        encoder,
+        decoder,
+        target_start_token,
+        target_end_token,
+        max_length,
+    ):
+        super().__init__()
+
+        self.encoder = encoder
+        self.decoder = decoder
+        self.pad_idx = 0
+        self.target_start_token = target_start_token
+        self.target_end_token = target_end_token
+        self.max_length = max_length
+
+        assert encoder.hidden_size == decoder.hidden_size
+
+    def create_mask(self, source_seq):
+        mask = source_seq != self.pad_idx
+        return mask
+
+    def forward(
+        self, source_seq, source_seq_len, target_seq, teacher_forcing_ratio=0.5
+    ):
+        # source_seq: (batch_size, MAX_LENGTH)
+        # source_seq_len: (batch_size, 1)
+        # target_seq: (batch_size, MAX_LENGTH)
+
+        batch_size = source_seq.size(0)
+        start_token = self.target_start_token
+        end_token = self.target_end_token
+        max_len = self.max_length
+        target_vocab_size = self.decoder.vocabulary_size
+
+        outputs = torch.zeros(max_len, batch_size, target_vocab_size).to(
+            device
+        )
+
+        if target_seq is None:
+            assert teacher_forcing_ratio == 0, "Must be zero during inference"
+            inference = True
+        else:
+            inference = False
+
+        encoder_outputs, encoder_hidden = self.encoder(
+            source_seq, source_seq_len
+        )
+
+        decoder_input = (
+            torch.tensor([[start_token] * batch_size])
+            .view(batch_size, 1)
+            .to(device)
+        )
+
+        encoder_hidden_h_t = torch.cat(
+            [encoder_hidden[0][0], encoder_hidden[0][1]], dim=1
+        ).unsqueeze(dim=0)
+        decoder_hidden = encoder_hidden_h_t
+
+        max_source_len = encoder_outputs.size(1)
+        mask = self.create_mask(source_seq[:, 0:max_source_len])
+
+        for di in range(max_len):
+            decoder_output, decoder_hidden, _ = self.decoder(
+                decoder_input, decoder_hidden, encoder_outputs, mask
+            )
+
+            _, topi = decoder_output.topk(1)
+            outputs[di] = decoder_output.to(device)
+
+            teacher_force = random.random() < teacher_forcing_ratio
+
+            decoder_input = (
+                target_seq[:, di].reshape(batch_size, 1)
+                if teacher_force
+                else topi.detach()
+            )
+
+            decoder_input = topi.detach()
+
+            if inference and decoder_input == end_token:
+                return outputs[:di]
+
+        return outputs
+
+
+_THAI_TO_ROM = ThaiTransliterator()
+
+
+
+[docs] +def romanize(text: str) -> str: + """Romanize Thai text + + :param text: Thai text to be romanized + :type text: str + :return: Roman characters representing the pronunciation of the Thai text + :rtype: str + """ + return _THAI_TO_ROM.romanize(text)
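A minimal usage sketch; the expected output follows the thai2rom example shown in pythainlp.transliterate.core above (the model weights are downloaded on first use)::

    from pythainlp.transliterate.thai2rom import romanize

    romanize("สามารถ")
    # output: 'samat'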
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/transliterate/wunsen.html b/5.1/_modules/pythainlp/transliterate/wunsen.html new file mode 100644 index 0000000..73ab2eb --- /dev/null +++ b/5.1/_modules/pythainlp/transliterate/wunsen.html @@ -0,0 +1,294 @@ + + + + + + + + pythainlp.transliterate.wunsen — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
+
+
+
+
+ +

Source code for pythainlp.transliterate.wunsen

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text
+to Thai text
+By Wunsen
+
+:See Also:
+    * `GitHub \
+        <https://github.com/cakimpei/wunsen>`_
+"""
+from wunsen import ThapSap
+
+
+
+[docs] +class WunsenTransliterate: + """ + Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text + to Thai text + by Wunsen + + :See Also: + * `GitHub \ + <https://github.com/cakimpei/wunsen>`_ + """ + +
+[docs] + def __init__(self) -> None: + self.thap_value = None + self.lang = None + self.jp_input = None + self.zh_sandhi = None + self.system = None
+ + +
+[docs] + def transliterate( + self, + text: str, + lang: str, + jp_input: str = None, + zh_sandhi: bool = None, + system: str = None, + ): + """ + Use Wunsen for transliteration + + :param str text: text to be transliterated to Thai text. + :param str lang: source language + :param str jp_input: Japanese input method (for Japanese only) + :param bool zh_sandhi: Mandarin third tone sandhi option + (for Mandarin only) + :param str system: transliteration system (for Japanese and + Mandarin only) + + :return: Thai text + :rtype: str + + :Options for lang: + * *jp* - Japanese (from Hepburn romanization) + * *ko* - Korean (from Revised Romanization) + * *vi* - Vietnamese (Latin script) + * *zh* - Mandarin (from Hanyu Pinyin) + :Options for jp_input: + * *Hepburn-no diacritic* - Hepburn-no diacritic (without macron) + :Options for zh_sandhi: + * *True* - apply third tone sandhi rule + * *False* - do not apply third tone sandhi rule + :Options for system: + * *ORS61* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น + (สำนักงานราชบัณฑิตยสภา พ.ศ. 2561) + * *RI35* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น + (ราชบัณฑิตยสถาน พ.ศ. 2535) + * *RI49* - for Mandarin หลักเกณฑ์การทับศัพท์ภาษาจีน + (ราชบัณฑิตยสถาน พ.ศ. 2549) + * *THC43* - for Mandarin เกณฑ์การถ่ายทอดเสียงภาษาจีนแมนดาริน + ด้วยอักขรวิธีไทย (คณะกรรมการสืบค้นประวัติศาสตร์ไทยในเอกสาร + ภาษาจีน พ.ศ. 2543) + + :Example: + :: + + from pythainlp.transliterate.wunsen import WunsenTransliterate + + wt = WunsenTransliterate() + + wt.transliterate("ohayō", lang="jp") + # output: 'โอฮาโย' + + wt.transliterate( + "ohayou", + lang="jp", + jp_input="Hepburn-no diacritic" + ) + # output: 'โอฮาโย' + + wt.transliterate("ohayō", lang="jp", system="RI35") + # output: 'โอะฮะโย' + + wt.transliterate("annyeonghaseyo", lang="ko") + # output: 'อันนย็องฮาเซโย' + + wt.transliterate("xin chào", lang="vi") + # output: 'ซีน จ่าว' + + wt.transliterate("ni3 hao3", lang="zh") + # output: 'หนี เห่า' + + wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False) + # output: 'หนี่ เห่า' + + wt.transliterate("ni3 hao3", lang="zh", system="RI49") + # output: 'หนี ห่าว' + """ + if ( + self.lang != lang + or self.jp_input != jp_input + or self.zh_sandhi != zh_sandhi + or self.system != system + ): + if lang == "jp": + self.jp_input = jp_input + self.zh_sandhi = None + self.system = system + elif lang == "zh": + self.jp_input = None + self.zh_sandhi = zh_sandhi + self.system = system + elif lang in ("ko", "vi"): + self.jp_input = None + self.zh_sandhi = None + self.system = None + else: + raise NotImplementedError( + "The %s language is not implemented." % lang + ) + self.lang = lang + input_lang = lang + if input_lang == "jp": + input_lang = "ja" + setting = {} + if self.jp_input is not None: + setting.update({"input": self.jp_input}) + if self.zh_sandhi is not None: + setting.update({"option": {"sandhi": self.zh_sandhi}}) + if self.system is not None: + setting.update({"system": self.system}) + self.thap_value = ThapSap(input_lang, **setting) + return self.thap_value.thap(text)
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/ulmfit/core.html b/5.1/_modules/pythainlp/ulmfit/core.html new file mode 100644 index 0000000..f5d7ee5 --- /dev/null +++ b/5.1/_modules/pythainlp/ulmfit/core.html @@ -0,0 +1,407 @@ + + + + + + + + pythainlp.ulmfit.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.ulmfit.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Universal Language Model Fine-tuning for Text Classification (ULMFiT).
+"""
+import collections
+from typing import Callable, Collection
+
+import numpy as np
+import torch
+
+from pythainlp.corpus import get_corpus_path
+from pythainlp.tokenize import THAI2FIT_TOKENIZER
+from pythainlp.ulmfit.preprocess import (
+    fix_html,
+    lowercase_all,
+    remove_space,
+    replace_rep_after,
+    replace_rep_nonum,
+    replace_url,
+    replace_wrep_post,
+    replace_wrep_post_nonum,
+    rm_brackets,
+    rm_useless_newlines,
+    rm_useless_spaces,
+    spec_add_spaces,
+    ungroup_emoji,
+)
+from pythainlp.util import reorder_vowels
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+_MODEL_NAME_LSTM = "wiki_lm_lstm"
+_ITOS_NAME_LSTM = "wiki_itos_lstm"
+
+
+# Pretrained model paths
+THWIKI_LSTM = {
+    "wgts_fname": get_corpus_path(_MODEL_NAME_LSTM),
+    "itos_fname": get_corpus_path(_ITOS_NAME_LSTM),
+}
+
+# Preprocessing rules for Thai text
+# dense features
+pre_rules_th = [
+    replace_rep_after,
+    fix_html,
+    reorder_vowels,
+    spec_add_spaces,
+    rm_useless_spaces,
+    rm_useless_newlines,
+    rm_brackets,
+    replace_url,
+]
+post_rules_th = [replace_wrep_post, ungroup_emoji, lowercase_all]
+
+# sparse features
+pre_rules_th_sparse = pre_rules_th[1:] + [replace_rep_nonum]
+post_rules_th_sparse = post_rules_th[1:] + [
+    replace_wrep_post_nonum,
+    remove_space,
+]
+
+
+
+[docs] +def process_thai( + text: str, + pre_rules: Collection = pre_rules_th_sparse, + tok_func: Callable = THAI2FIT_TOKENIZER.word_tokenize, + post_rules: Collection = post_rules_th_sparse, +) -> Collection[str]: + """ + Process Thai texts for models (with sparse features as default) + + :param str text: text to be cleaned + :param list[func] pre_rules: rules to apply before tokenization. + :param func tok_func: tokenization function (by default, **tok_func** is + :func:`pythainlp.tokenize.word_tokenize`) + + :param list[func] post_rules: rules to apply after tokenizations + + :return: a list of cleaned tokenized texts + :rtype: list[str] + + + :Note: + - The default **pre-rules** consists of :func:`fix_html`, + :func:`pythainlp.util.normalize`, + :func:`spec_add_spaces`, + :func:`rm_useless_spaces`, + :func:`rm_useless_newlines`, + :func:`rm_brackets` + and :func:`replace_rep_nonum`. + + - The default **post-rules** consists of :func:`ungroup_emoji`, + :func:`lowercase_all`, :func:`replace_wrep_post_nonum`, + and :func:`remove_space`. + + :Example: + + 1. Use default pre-rules and post-rules: + + >>> from pythainlp.ulmfit import process_thai + >>> text = "บ้านนนนน () อยู่นานนานนาน 😂🤣😃😄😅 PyThaiNLP amp; " + >>> process_thai(text) + [บ้าน', 'xxrep', ' ', 'อยู่', 'xxwrep', 'นาน', '😂', '🤣', + '😃', '😄', '😅', 'pythainlp', '&'] + + 2. Modify pre_rules and post_rules arguments with + rules provided in :mod:`pythainlp.ulmfit`: + + >>> from pythainlp.ulmfit import ( + process_thai, + replace_rep_after, + fix_html, + ungroup_emoji, + replace_wrep_post, + remove_space) + >>> + >>> text = "บ้านนนนน () อยู่นานนานนาน 😂🤣😃😄😅 PyThaiNLP amp; " + >>> process_thai(text, + pre_rules=[replace_rep_after, fix_html], + post_rules=[ungroup_emoji, + replace_wrep_post, + remove_space] + ) + ['บ้าน', 'xxrep', '5', '()', 'อยู่', 'xxwrep', '2', 'นาน', '😂', '🤣', + '😃', '😄', '😅', 'PyThaiNLP', '&'] + + + """ + res = text + + for rule in pre_rules: + res = rule(res) + res = tok_func(res) + for rule in post_rules: + res = rule(res) + + return res
+ + + +
+
[docs]
+def document_vector(text: str, learn, data, agg: str = "mean"):
+    """
+    This function vectorizes Thai input text into a 400-dimension vector using
+    a :class:`fastai` language model and data bunch.
+
+    :meth:`document_vector` gets a document vector using the fastai language
+    model and data bunch
+    :param str text: text to be vectorized with :class:`fastai` language model.
+    :param learn: :class:`fastai` language model learner
+    :param data: :class:`fastai` data bunch
+    :param str agg: name of aggregation method for word embeddings
+        The available methods are "mean" and "sum"
+
+    :return: :class:`numpy.array` of document vector sized 400 based on
+        the encoder of the model
+    :rtype: :class:`numpy.ndarray((1, 400))`
+
+    :Example:
+
+        >>> from pythainlp.ulmfit import document_vector
+        >>> from fastai import *
+        >>> from fastai.text import *
+        >>>
+        >>> # Load Data Bunch
+        >>> data = load_data(MODEL_PATH, 'thwiki_lm_data.pkl')
+        >>>
+        >>> # Initialize language_model_learner
+        >>> config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1,
+             qrnn=False, tie_weights=True, out_bias=True, output_p=0.25,
+             hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)
+        >>> trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1)
+        >>> learn = language_model_learner(data, AWD_LSTM, config=config,
+             pretrained=False, **trn_args)
+        >>> document_vector('วันนี้วันดีปีใหม่', learn, data)
+
+    :See Also:
+        * A notebook showing how to train `ulmfit` language model and its
+          usage, `Jupyter Notebook \
+          <https://github.com/cstorm125/thai2fit/blob/master/thwiki_lm/word2vec_examples.ipynb>`_
+
+    """
+
+    s = THAI2FIT_TOKENIZER.word_tokenize(text)
+    t = torch.tensor(data.vocab.numericalize(s), requires_grad=False).to(
+        device
+    )
+    m = learn.model[0].encoder.to(device)
+    res = m(t).cpu().detach().numpy()
+    if agg == "mean":
+        res = res.mean(0)
+    elif agg == "sum":
+        res = res.sum(0)
+    else:
+        raise ValueError("Aggregate by mean or sum")
+
+    return res
+ + + +
+[docs]
+def merge_wgts(em_sz, wgts, itos_pre, itos_new):
+    """
+    This function inserts new vocab into an existing set of model weights
+    `wgts` and initializes the weights of the new vocab with the average
+    embedding.
+
+    :meth: `merge_wgts` inserts pretrained weights and vocab into a new set
+           of weights and vocab; uses the average embedding if a vocab item
+           is not in the pretrained vocab
+    :param int em_sz: embedding size
+    :param wgts: torch model weights
+    :param list itos_pre: pretrained list of vocab
+    :param list itos_new: list of new vocab
+
+    :return: merged torch model weights
+
+    :Example:
+    ::
+
+        from pythainlp.ulmfit import merge_wgts
+        import torch
+
+        wgts = {'0.encoder.weight': torch.randn(5,3)}
+        itos_pre = ["แมว", "คน", "หนู"]
+        itos_new = ["ปลา", "เต่า", "นก"]
+        em_sz = 3
+
+        merge_wgts(em_sz, wgts, itos_pre, itos_new)
+        # output:
+        # {'0.encoder.weight': tensor([[0.5952, 0.4453, 0.0011],
+        # [0.5952, 0.4453, 0.0011],
+        # [0.5952, 0.4453, 0.0011]]),
+        # '0.encoder_dp.emb.weight': tensor([[0.5952, 0.4453, 0.0011],
+        # [0.5952, 0.4453, 0.0011],
+        # [0.5952, 0.4453, 0.0011]]),
+        # '1.decoder.weight': tensor([[0.5952, 0.4453, 0.0011],
+        # [0.5952, 0.4453, 0.0011],
+        # [0.5952, 0.4453, 0.0011]])}
+    """
+    vocab_size = len(itos_new)
+    enc_wgts = wgts["0.encoder.weight"].numpy()
+
+    # Average weight of encoding
+    row_m = enc_wgts.mean(0)
+    stoi_pre = collections.defaultdict(
+        lambda: -1, {v: k for k, v in enumerate(itos_pre)}
+    )
+
+    # New embedding based on classification dataset
+    new_w = np.zeros((vocab_size, em_sz), dtype=np.float32)
+
+    for i, w in enumerate(itos_new):
+        r = stoi_pre[w]
+        # Use pretrained embedding if present; else use the average
+        new_w[i] = enc_wgts[r] if r >= 0 else row_m
+
+    wgts["0.encoder.weight"] = torch.tensor(new_w)
+    wgts["0.encoder_dp.emb.weight"] = torch.tensor(np.copy(new_w))
+    wgts["1.decoder.weight"] = torch.tensor(np.copy(new_w))
+
+    return wgts
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/ulmfit/preprocess.html b/5.1/_modules/pythainlp/ulmfit/preprocess.html new file mode 100644 index 0000000..b77fe5a --- /dev/null +++ b/5.1/_modules/pythainlp/ulmfit/preprocess.html @@ -0,0 +1,465 @@ + + + + + + + + pythainlp.ulmfit.preprocess — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.ulmfit.preprocess

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Preprocessing for ULMFiT
+"""
+import html
+import re
+from typing import Collection, List
+
+import emoji
+
+_TK_UNK = "xxunk"
+_TK_REP = "xxrep"
+_TK_WREP = "xxwrep"
+_TK_END = "xxend"
+_TK_URL = "xxurl"
+
+
+def replace_url(text: str) -> str:
+    """
+    Replace URL in `text` with TK_URL
+
+    :param str text: text to replace URL in
+
+    :return: text with URLs replaced
+    :rtype: str
+
+    :Example:
+
+        >>> from pythainlp.ulmfit import replace_url
+        >>> replace_url("go to github.com")
+        go to xxurl
+    """
+    URL_PATTERN = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
+    return re.sub(URL_PATTERN, _TK_URL, text)
+
+
+
+[docs]
+def fix_html(text: str) -> str:
+    """
+    Replace HTML strings in `text`. (codes from `fastai`)
+
+    :param str text: text to replace HTML strings in
+
+    :return: text with HTML strings replaced
+    :rtype: str
+
+    :Example:
+
+        >>> from pythainlp.ulmfit import fix_html
+        >>> fix_html("Anbsp;amp;nbsp;B @.@ ")
+        A & B.
+    """
+    re1 = re.compile(r" +")
+    text = (
+        text.replace("#39;", "'")
+        .replace("amp;", "&")
+        .replace("#146;", "'")
+        .replace("nbsp;", " ")
+        .replace("#36;", "$")
+        .replace("\\n", "\n")
+        .replace("quot;", "'")
+        .replace("<br />", "\n")
+        .replace('\\"', '"')
+        .replace("<unk>", _TK_UNK)
+        .replace(" @.@ ", ".")
+        .replace(" @-@ ", "-")
+        .replace(" @,@ ", ",")
+        .replace("\\", " \\ ")
+    )
+    return re1.sub(" ", html.unescape(text))
+ + + +
+[docs] +def rm_useless_spaces(text: str) -> str: + """Remove multiple spaces in `text`. (codes from `fastai`)""" + return re.sub(" {2,}", " ", text)
+ + + +
+[docs] +def spec_add_spaces(text: str) -> str: + """Add spaces around / and # in `text`. \n (codes from `fastai`)""" + return re.sub(r"([/#\n])", r" \1 ", text)
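+# A minimal usage sketch (illustrative only, not part of the library source)
+# showing how the two whitespace helpers above behave; the input strings are
+# made up for illustration:
+#
+#     >>> spec_add_spaces("ไป#กิน/นอน")
+#     'ไป # กิน / นอน'
+#     >>> rm_useless_spaces("ไป  #  กิน")
+#     'ไป # กิน'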
+ + + +
+[docs]
+def replace_rep_after(text: str) -> str:
+    """
+    Replace repetitions at the character level in `text` after the repeated character.
+    This is to prevent cases such as 'น้อยยยยยยยย' becoming 'น้อ xxrep 8 ย';
+    instead the word is retained as 'น้อย xxrep 8'
+
+    :param str text: input text to replace character repetitions in
+
+    :return: text with repetitive token **xxrep** and the counter
+             after the repeated character
+    :rtype: str
+
+    :Example:
+
+        >>> from pythainlp.ulmfit import replace_rep_after
+        >>>
+        >>> text = "กาาาาาาา"
+        >>> replace_rep_after(text)
+        'กาxxrep7 '
+    """
+
+    def _replace_rep(m):
+        c, cc = m.groups()
+        return f"{c}{_TK_REP}{len(cc)+1} "
+
+    re_rep = re.compile(r"(\S)(\1{3,})")
+
+    return re_rep.sub(_replace_rep, text)
+ + + +
+[docs]
+def replace_wrep_post(toks: Collection[str]) -> List[str]:
+    """
+    Replace repetitive words after tokenization;
+    fastai `replace_wrep` does not work well with Thai.
+
+    :param list[str] toks: list of tokens
+
+    :return: list of tokens where the **xxwrep** token and the counter
+             are added before repetitive words.
+    :rtype: list[str]
+
+    :Example:
+
+        >>> from pythainlp.ulmfit import replace_wrep_post
+        >>>
+        >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"]
+        >>> replace_wrep_post(toks)
+        ['กา', 'xxwrep', '3', 'น้ำ']
+
+    """
+    previous_word = None
+    rep_count = 0
+    res = []
+    for current_word in toks + [_TK_END]:
+        if current_word == previous_word:
+            rep_count += 1
+        elif (current_word != previous_word) & (rep_count > 0):
+            res += [_TK_WREP, str(rep_count), previous_word]
+            rep_count = 0
+        else:
+            res.append(previous_word)
+        previous_word = current_word
+    return res[1:]
+ + + +
+[docs] +def rm_useless_newlines(text: str) -> str: + "Remove multiple newlines in `text`." + + return re.sub(r"[\n]{2,}", " ", text)
+ + + +
+[docs] +def rm_brackets(text: str) -> str: + "Remove all empty brackets and artifacts within brackets from `text`." + # remove empty brackets + new_line = re.sub(r"\(\)", "", text) + new_line = re.sub(r"\{\}", "", new_line) + new_line = re.sub(r"\[\]", "", new_line) + # brackets with only punctuation marks + new_line = re.sub(r"\([^a-zA-Z0-9ก-๙]+\)", "", new_line) + new_line = re.sub(r"\{[^a-zA-Z0-9ก-๙]+\}", "", new_line) + new_line = re.sub(r"\[[^a-zA-Z0-9ก-๙]+\]", "", new_line) + # artifacts after ( + new_line = re.sub( + r"(?<=\()[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line + ) + new_line = re.sub( + r"(?<=\{)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line + ) + new_line = re.sub( + r"(?<=\[)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line + ) + # artifacts before ) + new_line = re.sub( + r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\))", "", new_line + ) + new_line = re.sub( + r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\})", "", new_line + ) + new_line = re.sub( + r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\])", "", new_line + ) + return new_line
+ + + +
+[docs] +def ungroup_emoji(toks: Collection[str]) -> List[str]: + """ + Ungroup Zero Width Joiner (ZVJ) Emojis + + See https://emojipedia.org/emoji-zwj-sequence/ + """ + res = [] + for tok in toks: + if emoji.emoji_count(tok) == len(tok): + res.extend(list(tok)) + else: + res.append(tok) + return res
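+# A minimal sketch (illustrative only, not part of the library source) of how
+# ungroup_emoji splits a token that is purely a run of emojis; the token list
+# is a made-up example:
+#
+#     >>> ungroup_emoji(["สวัสดี", "😂🤣"])
+#     ['สวัสดี', '😂', '🤣']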
+ + + +
+[docs] +def lowercase_all(toks: Collection[str]) -> List[str]: + """ + Lowercase all English words; + English words in Thai texts don't usually have nuances of capitalization. + """ + return [tok.lower() for tok in toks]
+ + + +
+[docs]
+def replace_rep_nonum(text: str) -> str:
+    """
+    Replace repetitions at the character level in `text` after the repetition.
+    This is done to prevent cases such as 'น้อยยยยยยยย' becoming 'น้อ xxrep ย';
+    instead the word is retained as 'น้อย xxrep '
+
+    :param str text: input text to replace character repetition
+
+    :return: text with repetitive token **xxrep** after
+             character repetition
+    :rtype: str
+
+    :Example:
+
+        >>> from pythainlp.ulmfit import replace_rep_nonum
+        >>>
+        >>> text = "กาาาาาาา"
+        >>> replace_rep_nonum(text)
+        'กา xxrep '
+
+    """
+
+    def _replace_rep(m):
+        c, _ = m.groups()
+        return f"{c} {_TK_REP} "
+
+    re_rep = re.compile(r"(\S)(\1{3,})")
+    return re_rep.sub(_replace_rep, text)
+ + + +
+[docs]
+def replace_wrep_post_nonum(toks: Collection[str]) -> List[str]:
+    """
+    Replace repetitive words after tokenization;
+    fastai `replace_wrep` does not work well with Thai.
+
+    :param list[str] toks: list of tokens
+
+    :return: list of tokens where the **xxwrep** token is added in front of
+             repetitive words.
+    :rtype: list[str]
+
+    :Example:
+
+        >>> from pythainlp.ulmfit import replace_wrep_post_nonum
+        >>>
+        >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"]
+        >>> replace_wrep_post_nonum(toks)
+        ['กา', 'xxwrep', 'น้ำ']
+
+    """
+    previous_word = None
+    rep_count = 0
+    res = []
+    for current_word in toks + [_TK_END]:
+        if current_word == previous_word:
+            rep_count += 1
+        elif (current_word != previous_word) & (rep_count > 0):
+            res += [_TK_WREP, previous_word]
+            rep_count = 0
+        else:
+            res.append(previous_word)
+        previous_word = current_word
+    return res[1:]
+ + + +
+[docs]
+def remove_space(toks: Collection[str]) -> List[str]:
+    """
+    Remove space tokens, which are not useful for bag-of-words models.
+
+    :param list[str] toks: list of tokens
+
+    :return: list of tokens where space tokens (" ") are filtered out
+    :rtype: list[str]
+    """
+    res = []
+    for t in toks:
+        t = t.strip()
+        if t:
+            res.append(t)
+    return res
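+# A minimal sketch (illustrative only, not part of the library source)
+# chaining the post-tokenization helpers above; the token list is invented
+# for illustration:
+#
+#     >>> toks = ["PyThaiNLP", " ", "ภาษา", "ไทย"]
+#     >>> remove_space(lowercase_all(toks))
+#     ['pythainlp', 'ภาษา', 'ไทย']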
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/ulmfit/tokenizer.html b/5.1/_modules/pythainlp/ulmfit/tokenizer.html new file mode 100644 index 0000000..e3f0cd1 --- /dev/null +++ b/5.1/_modules/pythainlp/ulmfit/tokenizer.html @@ -0,0 +1,221 @@ + + + + + + + + pythainlp.ulmfit.tokenizer — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.ulmfit.tokenizer

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Tokenizer classes for ULMFiT
+"""
+
+from typing import Collection, List
+
+from pythainlp.tokenize import THAI2FIT_TOKENIZER
+
+
+class BaseTokenizer:
+    """Basic class for a tokenizer function. (codes from `fastai`)"""
+
+    def __init__(self, lang: str):
+        self.lang = lang
+
+    def tokenizer(self, t: str) -> List[str]:
+        return t.split(" ")
+
+    def add_special_cases(self, toks: Collection[str]):
+        pass
+
+
+
+[docs] +class ThaiTokenizer(BaseTokenizer): + """ + Wrapper around a frozen newmm tokenizer to make it a + :class:`fastai.BaseTokenizer`. + (see: https://docs.fast.ai/text.transform#BaseTokenizer) + """ + +
+[docs] + def __init__(self, lang: str = "th"): + self.lang = lang
+ + +
+[docs] + @staticmethod + def tokenizer(text: str) -> List[str]: + """ + This function tokenizes text using *newmm* engine and the dictionary + specifically for `ulmfit` related functions + (see: `Dictionary file (.txt) \ + <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/words_th_thai2fit_201810.txt>`_). + :meth: tokenize text using a frozen newmm engine + :param str text: text to tokenize + :return: tokenized text + :rtype: list[str] + + :Example: + + Using :func:`pythainlp.ulmfit.ThaiTokenizer.tokenizer` is + similar to :func:`pythainlp.tokenize.word_tokenize` + using *ulmfit* engine. + + >>> from pythainlp.ulmfit import ThaiTokenizer + >>> from pythainlp.tokenize import word_tokenize + >>> + >>> text = "อาภรณ์, จินตมยปัญญา ภาวนามยปัญญา" + >>> ThaiTokenizer.tokenizer(text) + ['อาภรณ์', ',', ' ', 'จิน', 'ตม', 'ย', 'ปัญญา', + ' ', 'ภาวนามยปัญญา'] + >>> + >>> word_tokenize(text, engine='ulmfit') + ['อาภรณ์', ',', ' ', 'จิน', 'ตม', 'ย', 'ปัญญา', + ' ', 'ภาวนามยปัญญา'] + + """ + return THAI2FIT_TOKENIZER.word_tokenize(text)
+ + +
+[docs] + def add_special_cases(self, toks): + pass
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/abbreviation.html b/5.1/_modules/pythainlp/util/abbreviation.html new file mode 100644 index 0000000..d2efaf0 --- /dev/null +++ b/5.1/_modules/pythainlp/util/abbreviation.html @@ -0,0 +1,187 @@ + + + + + + + + pythainlp.util.abbreviation — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.abbreviation

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai abbreviation tools
+"""
+from typing import List, Tuple, Union
+
+
+
+[docs]
+def abbreviation_to_full_text(text: str, top_k: int = 2) -> List[Tuple[str, Union[float, None]]]:
+    """
+    This function converts Thai text (with abbreviations) to full text.
+
+    This function uses KhamYo to handle abbreviations.
+    See `KhamYo <https://github.com/wannaphong/KhamYo>`_ for more details.
+
+    :param str text: Thai text
+    :param int top_k: number of top candidates to return
+    :return: Thai full text with abbreviations expanded, together with cosine
+             similarity scores (original text vs. modified text).
+    :rtype: List[Tuple[str, Union[float, None]]]
+
+    :Example:
+    ::
+
+        from pythainlp.util import abbreviation_to_full_text
+
+        text = "รร.ของเราน่าอยู่"
+
+        abbreviation_to_full_text(text)
+        # output: [
+        #   ('โรงเรียนของเราน่าอยู่', tensor(0.3734)),
+        #   ('โรงแรมของเราน่าอยู่', tensor(0.2438))
+        # ]
+    """
+    try:
+        from khamyo import replace as _replace
+    except ImportError:
+        raise ImportError(
+            """
+            This function needs khamyo.
+            You can install it with pip install khamyo or
+            pip install pythainlp[abbreviation].
+            """
+        )
+    return _replace(text, top_k=top_k)
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/collate.html b/5.1/_modules/pythainlp/util/collate.html new file mode 100644 index 0000000..351647c --- /dev/null +++ b/5.1/_modules/pythainlp/util/collate.html @@ -0,0 +1,195 @@ + + + + + + + + pythainlp.util.collate — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.collate

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai collation (sorted according to Thai dictionary order)
+Simple implementation using regular expressions
+"""
+import re
+from typing import Iterable, List
+
+_RE_TONE = re.compile(r"[็-์]")
+_RE_LV_C = re.compile(r"([เ-ไ])([ก-ฮ])")
+
+
+def _thkey(word: str) -> str:
+    cv = _RE_TONE.sub("", word)  # remove tone
+    cv = _RE_LV_C.sub("\\2\\1", cv)  # switch lead vowel
+
+    tone_match = _RE_TONE.search(word)
+    tone = tone_match.group() if tone_match else ""
+    return cv + tone
+
+
+
+[docs] +def collate(data: Iterable, reverse: bool = False) -> List[str]: + """ + This function sorts strings (almost) according to Thai dictionary. + + Important notes: this implementation ignores tone marks and symbols + + :param data: a list of words to be sorted + :type data: Iterable + :param reverse: If `reverse` is set to **True** the result will be + sorted in descending order. Otherwise, the result + will be sorted in ascending order, defaults to False + :type reverse: bool, optional + + :return: a list of strings, sorted alphabetically, (almost) according to + Thai dictionary + :rtype: List[str] + + :Example: + :: + + from pythainlp.util import collate + + collate(['ไก่', 'เกิด', 'กาล', 'เป็ด', 'หมู', 'วัว', 'วันที่']) + # output: ['กาล', 'เกิด', 'ไก่', 'เป็ด', 'วันที่', 'วัว', 'หมู'] + + collate(['ไก่', 'เกิด', 'กาล', 'เป็ด', 'หมู', 'วัว', 'วันที่'], \\ + reverse=True) + # output: ['หมู', 'วัว', 'วันที่', 'เป็ด', 'ไก่', 'เกิด', 'กาล'] + """ + return sorted(data, key=_thkey, reverse=reverse)
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/date.html b/5.1/_modules/pythainlp/util/date.html new file mode 100644 index 0000000..a190b11 --- /dev/null +++ b/5.1/_modules/pythainlp/util/date.html @@ -0,0 +1,551 @@ + + + + + + + + pythainlp.util.date — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.date

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai date/time conversion.
+
+Note: It does not take into account the change of new year's day in Thailand
+"""
+
+# BE is พ.ศ. (Buddhist Era)
+# AD is ค.ศ. (Anno Domini)
+# AH (Hijri year): add 1122 to convert to the Buddhist Era
+# BE years before Thailand changed its New Year's Day are not supported
+
+__all__ = [
+    "convert_years",
+    "thai_abbr_months",
+    "thai_abbr_weekdays",
+    "thai_full_months",
+    "thai_full_weekdays",
+    "thai_strptime",
+    "thaiword_to_date",
+]
+
+import re
+from datetime import datetime, timedelta
+from typing import Union
+
+try:
+    from zoneinfo import ZoneInfo
+except ImportError:
+    from backports.zoneinfo import ZoneInfo
+
+
+thai_abbr_weekdays = ["จ", "อ", "พ", "พฤ", "ศ", "ส", "อา"]
+thai_full_weekdays = [
+    "วันจันทร์",
+    "วันอังคาร",
+    "วันพุธ",
+    "วันพฤหัสบดี",
+    "วันศุกร์",
+    "วันเสาร์",
+    "วันอาทิตย์",
+]
+
+thai_abbr_months = [
+    "ม.ค.",
+    "ก.พ.",
+    "มี.ค.",
+    "เม.ย.",
+    "พ.ค.",
+    "มิ.ย.",
+    "ก.ค.",
+    "ส.ค.",
+    "ก.ย.",
+    "ต.ค.",
+    "พ.ย.",
+    "ธ.ค.",
+]
+thai_full_months = [
+    "มกราคม",
+    "กุมภาพันธ์",
+    "มีนาคม",
+    "เมษายน",
+    "พฤษภาคม",
+    "มิถุนายน",
+    "กรกฎาคม",
+    "สิงหาคม",
+    "กันยายน",
+    "ตุลาคม",
+    "พฤศจิกายน",
+    "ธันวาคม",
+]
+thai_full_month_lists = [
+    ["มกราคม", "มกรา", "ม.ค.", "01", "1"],
+    ["กุมภาพันธ์", "กุมภา", "ก.พ.", "02", "2"],
+    ["มีนาคม", "มีนา", "มี.ค.", "03", "3"],
+    ["เมษายน", "เมษา", "เม.ย.", "04", "4"],
+    ["พฤษภาคม", "พฤษภา", "พ.ค.", "05", "5"],
+    ["มิถุนายน", "มิถุนา", "มิ.ย.", "06", "6"],
+    ["กรกฎาคม", "ก.ค.", "07", "7"],
+    ["สิงหาคม", "สิงหา", "ส.ค.", "08", "8"],
+    ["กันยายน", "กันยา", "ก.ย.", "09", "9"],
+    ["ตุลาคม", "ตุลา", "ต.ค.", "10"],
+    ["พฤศจิกายน", "พฤศจิกา", "พ.ย.", "11"],
+    ["ธันวาคม", "ธันวา", "ธ.ค.", "12"]
+]
+thai_full_month_lists_regex = "(" + '|'.join(
+    ['|'.join(i) for i in thai_full_month_lists]
+) + ")"
+year_all_regex = r"(\d\d\d\d|\d\d)"
+dates_list = "(" + '|'.join(
+    [str(i) for i in range(32, 0, -1)] + [
+        "0" + str(i) for i in range(1, 10)
+    ]
+) + ")"
+
+_DAY = {
+    "วันนี้": 0,
+    "คืนนี้": 0,
+    "พรุ่งนี้": 1,
+    "วันพรุ่งนี้": 1,
+    "คืนถัดจากนี้": 1,
+    "คืนหน้า": 1,
+    "มะรืน": 2,
+    "มะรืนนี้": 2,
+    "วันมะรืนนี้": 2,
+    "ถัดจากพรุ่งนี้": 2,
+    "ถัดจากวันพรุ่งนี้": 2,
+    "เมื่อวาน": -1,
+    "เมื่อวานนี้": -1,
+    "วานนี้": -1,
+    "เมื่อคืน": -1,
+    "เมื่อคืนนี้": -1,
+    "วานซืน": -2,
+    "เมื่อวานซืน": -2,
+    "เมื่อวานของเมื่อวาน": -2,
+}
+
+
+
+[docs]
+def convert_years(year: str, src="be", target="ad") -> str:
+    """
+    Convert years between calendar eras
+
+    :param str year: year
+    :param str src: the source era
+    :param str target: the target era
+    :return: the converted year
+    :rtype: str
+
+    **Options for src and target**
+        * *be* - Buddhist calendar
+        * *ad* - Anno Domini
+        * *re* - Rattanakosin era
+        * *ah* - Anno Hejira
+
+    **Warning**: This function works properly only after 1941 \
+        because Thailand changed the Thai calendar in 1941.
+        If you are a time traveler or a historian, \
+        you should take care to use the correct calendar.
+    """
+    output_year = None
+    if src == "be":
+        # พ.ศ. - 543 = ค.ศ.
+        if target == "ad":
+            output_year = str(int(year) - 543)
+        # พ.ศ. - 2324 = ร.ศ.
+        elif target == "re":
+            output_year = str(int(year) - 2324)
+        # พ.ศ. - 1122 = ฮ.ศ.
+        elif target == "ah":
+            output_year = str(int(year) - 1122)
+    elif src == "ad":
+        # ค.ศ. + 543 = พ.ศ.
+        if target == "be":
+            output_year = str(int(year) + 543)
+        # ค.ศ. + 543 - 2324 = ร.ศ.
+        elif target == "re":
+            output_year = str(int(year) + 543 - 2324)
+        # ค.ศ. + 543 - 1122 = ฮ.ศ.
+        elif target == "ah":
+            output_year = str(int(year) + 543 - 1122)
+    elif src == "re":
+        # ร.ศ. + 2324 = พ.ศ.
+        if target == "be":
+            output_year = str(int(year) + 2324)
+        # ร.ศ. + 2324 - 543 = ค.ศ.
+        elif target == "ad":
+            output_year = str(int(year) + 2324 - 543)
+        # ร.ศ. + 2324 - 1122 = ฮ.ศ.
+        elif target == "ah":
+            output_year = str(int(year) + 2324 - 1122)
+    elif src == "ah":
+        # ฮ.ศ. + 1122 = พ.ศ.
+        if target == "be":
+            output_year = str(int(year) + 1122)
+        # ฮ.ศ. + 1122 - 543 = ค.ศ.
+        elif target == "ad":
+            output_year = str(int(year) + 1122 - 543)
+        # ฮ.ศ. + 1122 - 2324 = ร.ศ.
+        elif target == "re":
+            output_year = str(int(year) + 1122 - 2324)
+    if output_year is None:
+        raise NotImplementedError(
+            f"This function doesn't support {src} to {target}"
+        )
+    return output_year
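+# A minimal usage sketch (illustrative only, not part of the library source);
+# the years below are chosen only for illustration:
+#
+#     >>> convert_years("2566", src="be", target="ad")
+#     '2023'
+#     >>> convert_years("2023", src="ad", target="be")
+#     '2566'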
+ + + +def _find_month(text): + for i, m in enumerate(thai_full_month_lists): + for j in m: + if j in text: + return i + 1 + + +
+[docs] +def thai_strptime( + text: str, + fmt: str, + year: str = "be", + add_year: int = None, + tzinfo=ZoneInfo("Asia/Bangkok") +): + """ + Thai strptime + + :param str text: text + :param str fmt: string containing date and time directives + :param str year: year of the text \ + (ad is Anno Domini and be is Buddhist Era) + :param int add_year: add to year when converting to ad + :param object tzinfo: tzinfo (default is Asia/Bangkok) + :return: The year that is converted to datetime.datetime + :rtype: datetime.datetime + + The fmt chars that are supported: + * *%d* - Day (1 - 31) + * *%B* - Thai month (03, 3, มี.ค., or มีนาคม) + * *%Y* - Year (66, 2566, or 2023) + * *%H* - Hour (0 - 23) + * *%M* - Minute (0 - 59) + * *%S* - Second (0 - 59) + * *%f* - Microsecond + + :Example: + :: + + from pythainlp.util import thai_strptime + + thai_strptime("15 ก.ค. 2565 09:00:01","%d %B %Y %H:%M:%S") + # output: + # datetime.datetime( + # 2022, + # 7, + # 15, + # 9, + # 0, + # 1, + # tzinfo=zoneinfo.ZoneInfo(key='Asia/Bangkok') + # ) + """ + d = "" + m = "" + y = "" + fmt = fmt.replace("%-m", "%m") + fmt = fmt.replace("%-d", "%d") + fmt = fmt.replace("%b", "%B") + fmt = fmt.replace("%-y", "%y") + data = {} + _old = fmt + if "%d" in fmt: + fmt = fmt.replace("%d", dates_list) + if "%B" in fmt: + fmt = fmt.replace("%B", thai_full_month_lists_regex) + if "%Y" in fmt: + fmt = fmt.replace("%Y", year_all_regex) + if "%H" in fmt: + fmt = fmt.replace("%H", r"(\d\d|\d)") + if "%M" in fmt: + fmt = fmt.replace("%M", r"(\d\d|\d)") + if "%S" in fmt: + fmt = fmt.replace("%S", r"(\d\d|\d)") + if "%f" in fmt: + fmt = fmt.replace("%f", r"(\d+)") + keys = [ + i.strip().strip('-').strip(':').strip('.') + for i in _old.split("%") if i != '' + ] + y = re.findall(fmt, text) + + data = {i: ''.join(list(j)) for i, j in zip(keys, y[0])} + H = 0 + M = 0 + S = 0 + f = 0 + d = data['d'] + m = _find_month(data['B']) + y = data['Y'] + if "H" in keys: + H = data['H'] + if "M" in keys: + M = data['M'] + if "S" in keys: + S = data['S'] + if "f" in keys: + f = data['f'] + if int(y) < 100 and year == "be": + if add_year is None: + y = str(2500 + int(y)) + else: + y = str(int(add_year) + int(y)) + elif int(y) < 100 and year == "ad": + if add_year is None: + y = str(2000 + int(y)) + else: + y = str(int(add_year) + int(y)) + if year == "be": + y = convert_years(y, src="be", target="ad") + return datetime( + year=int(y), + month=int(m), + day=int(d), + hour=int(H), + minute=int(M), + second=int(S), + microsecond=int(f), + tzinfo=tzinfo + )
+ + + +
+[docs]
+def now_reign_year() -> int:
+    """
+    Return the reign year of the 10th King of the Chakri dynasty.
+
+    :return: reign year of the 10th King of the Chakri dynasty.
+    :rtype: int
+
+    :Example:
+    ::
+
+        from pythainlp.util import now_reign_year
+
+        text = "เป็นปีที่ {reign_year} ในรัชกาลปัจจุบัน"\\
+            .format(reign_year=now_reign_year())
+
+        print(text)
+        # output: เป็นปีที่ 4 ในรัชกาลปัจจุบัน
+    """
+    now_ = datetime.now()
+    return now_.year - 2015
+ + + +
+[docs]
+def reign_year_to_ad(reign_year: int, reign: int) -> int:
+    """
+    Convert reign year to AD.
+
+    Return AD year according to the reign year for
+    the 7th to 10th King of the Chakri dynasty, Thailand.
+    For instance, the AD year of the 4th reign year of the 10th King is 2019.
+
+    :param int reign_year: reign year of the King
+    :param int reign: the reign of the King (i.e. 7, 8, 9, and 10)
+
+    :return: the year in AD of the King given the reign and reign year.
+    :rtype: int
+
+    :Example:
+    ::
+
+        from pythainlp.util import reign_year_to_ad
+
+        print("The 4th reign year of the King Rama X is in", \\
+            reign_year_to_ad(4, 10))
+        # output: The 4th reign year of the King Rama X is in 2019
+
+        print("The 1st reign year of the King Rama IX is in", \\
+            reign_year_to_ad(1, 9))
+        # output: The 1st reign year of the King Rama IX is in 1946
+    """
+    if int(reign) == 10:
+        ad = int(reign_year) + 2015
+    elif int(reign) == 9:
+        ad = int(reign_year) + 1945
+    elif int(reign) == 8:
+        ad = int(reign_year) + 1928
+    elif int(reign) == 7:
+        ad = int(reign_year) + 1924
+    return ad
+ + + +
+[docs] +def thaiword_to_date( + text: str, date: datetime = None +) -> Union[datetime, None]: + """ + Convert Thai relative date to :class:`datetime.datetime`. + + :param str text: Thai text containing relative date + :param datetime.datetime date: date (default is datetime.datetime.now()) + + :return: datetime object, if it can be calculated. Otherwise, None. + :rtype: datetime.datetime + + :Example: + + thaiword_to_date("พรุ่งนี้") + # output: + # datetime of tomorrow + """ + if text not in _DAY: + return None + + day_num = _DAY.get(text) + + if not date: + date = datetime.now() + + return date + timedelta(days=day_num)
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/digitconv.html b/5.1/_modules/pythainlp/util/digitconv.html new file mode 100644 index 0000000..3b00f9c --- /dev/null +++ b/5.1/_modules/pythainlp/util/digitconv.html @@ -0,0 +1,353 @@ + + + + + + + + pythainlp.util.digitconv — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.digitconv

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Convert digits
+"""
+
+_arabic_thai = {
+    "0": "๐",
+    "1": "๑",
+    "2": "๒",
+    "3": "๓",
+    "4": "๔",
+    "5": "๕",
+    "6": "๖",
+    "7": "๗",
+    "8": "๘",
+    "9": "๙",
+}
+
+_thai_arabic = {
+    "๐": "0",
+    "๑": "1",
+    "๒": "2",
+    "๓": "3",
+    "๔": "4",
+    "๕": "5",
+    "๖": "6",
+    "๗": "7",
+    "๘": "8",
+    "๙": "9",
+}
+
+_digit_spell = {
+    "0": "ศูนย์",
+    "1": "หนึ่ง",
+    "2": "สอง",
+    "3": "สาม",
+    "4": "สี่",
+    "5": "ห้า",
+    "6": "หก",
+    "7": "เจ็ด",
+    "8": "แปด",
+    "9": "เก้า",
+}
+
+_spell_digit = {
+    "ศูนย์": "0",
+    "หนึ่ง": "1",
+    "สอง": "2",
+    "สาม": "3",
+    "สี่": "4",
+    "ห้า": "5",
+    "หก": "6",
+    "เจ็ด": "7",
+    "แปด": "8",
+    "เก้า": "9",
+}
+
+_arabic_thai_translate_table = str.maketrans(_arabic_thai)
+_thai_arabic_translate_table = str.maketrans(_thai_arabic)
+_digit_spell_translate_table = str.maketrans(_digit_spell)
+
+
+
+[docs] +def thai_digit_to_arabic_digit(text: str) -> str: + """ + This function converts Thai digits (i.e. ๑, ๓, ๑๐) to Arabic digits + (i.e. 1, 3, 10). + + :param str text: Text with Thai digits such as '๑', '๒', '๓' + :return: Text with Thai digits converted to Arabic digits + such as '1', '2', '3' + :rtype: str + + :Example: + :: + + from pythainlp.util import thai_digit_to_arabic_digit + + text = 'เป็นจำนวน ๑๒๓,๔๐๐.๒๕ บาท' + + thai_digit_to_arabic_digit(text) + # output: เป็นจำนวน 123,400.25 บาท + """ + if not text or not isinstance(text, str): + return "" + + return text.translate(_thai_arabic_translate_table)
+ + + +
+[docs] +def arabic_digit_to_thai_digit(text: str) -> str: + """ + This function converts Arabic digits (i.e. 1, 3, 10) to Thai digits + (i.e. ๑, ๓, ๑๐). + + :param str text: Text with Arabic digits such as '1', '2', '3' + :return: Text with Arabic digits converted to Thai digits + such as '๑', '๒', '๓' + :rtype: str + + :Example: + :: + + from pythainlp.util import arabic_digit_to_thai_digit + + text = 'เป็นจำนวน 123,400.25 บาท' + + arabic_digit_to_thai_digit(text) + # output: เป็นจำนวน ๑๒๓,๔๐๐.๒๕ บาท + """ + if not text or not isinstance(text, str): + return "" + + # Convert Arabic to Thai numerals + return text.translate(_arabic_thai_translate_table)
+ + + +
+[docs] +def digit_to_text(text: str) -> str: + """ + :param str text: Text with digits such as '1', '2', '๓', '๔' + :return: Text with digits spelled out in Thai + """ + if not text or not isinstance(text, str): + return "" + + # Convert Thai numerals to Arabic ones + text = text.translate(_thai_arabic_translate_table) + # Spell out Arabic numerals in Thai text + text = text.translate(_digit_spell_translate_table) + return text
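+# A minimal usage sketch (illustrative only, not part of the original
+# docstring); the input string is made up for illustration:
+#
+#     >>> digit_to_text("เลขที่ ๕๗")
+#     'เลขที่ ห้าเจ็ด'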
+ + + +
+[docs] +def text_to_arabic_digit(text: str) -> str: + """ + This function converts spelled out digits in Thai to Arabic digits. + + :param text: A digit spelled out in Thai + :return: An Arabic digit such as '1', '2', '3' if the text is + digit spelled out in Thai (ศูนย์, หนึ่ง, สอง, ..., เก้า). + Otherwise, it returns an empty string. + :rtype: str + + :Example: + :: + + from pythainlp.util import text_to_arabic_digit + + text_to_arabic_digit("ศูนย์") + # output: 0 + text_to_arabic_digit("หนึ่ง") + # output: 1 + text_to_arabic_digit("แปด") + # output: 8 + text_to_arabic_digit("เก้า") + # output: 9 + + # For text that is not digit spelled out in Thai + text_to_arabic_digit("สิบ") == "" + # output: True + text_to_arabic_digit("เก้าร้อย") == "" + # output: True + """ + if not text or text not in _spell_digit: + return "" + + return _spell_digit[text]
+ + + +
+[docs] +def text_to_thai_digit(text: str) -> str: + """ + This function converts spelled out digits in Thai to Thai digits. + + :param text: A digit spelled out in Thai + :return: A Thai digit such as '๑', '๒', '๓' if the text is digit + spelled out in Thai (ศูนย์, หนึ่ง, สอง, ..., เก้า). + Otherwise, it returns an empty string. + :rtype: str + + :Example: + :: + + from pythainlp.util import text_to_thai_digit + + text_to_thai_digit("ศูนย์") + # output: ๐ + text_to_thai_digit("หนึ่ง") + # output: ๑ + text_to_thai_digit("แปด") + # output: ๘ + text_to_thai_digit("เก้า") + # output: ๙ + + # For text that is not Thai digit spelled out + text_to_thai_digit("สิบ") == "" + # output: True + text_to_thai_digit("เก้าร้อย") == "" + # output: True + """ + return arabic_digit_to_thai_digit(text_to_arabic_digit(text))
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/emojiconv.html b/5.1/_modules/pythainlp/util/emojiconv.html new file mode 100644 index 0000000..fa92003 --- /dev/null +++ b/5.1/_modules/pythainlp/util/emojiconv.html @@ -0,0 +1,2005 @@ + + + + + + + + pythainlp.util.emojiconv — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.emojiconv

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Convert emojis
+"""
+
+import re
+
+_emoji_th = {
+    "😀": "หน้ายิ้มยิงฟัน",
+    "😁": "ยิ้มยิงฟันตายิ้ม",
+    "😂": "ร้องไห้ดีใจ",
+    "😃": "หน้ายิ้มอ้าปาก",
+    "😄": "หน้ายิ้มตายิ้ม_อ้าปาก",
+    "😅": "ยิ้มเหงื่อตก",
+    "😆": "ยิ้มตาหยี",
+    "😇": "ยิ้มเทวดา",
+    "😉": "ขยิบตา",
+    "😊": "หน้ายิ้มตายิ้ม",
+    "🙂": "หน้ายิ้มบางๆ",
+    "🙃": "หน้ากลับหัว",
+    "🤣": "ขำกลิ้ง",
+    "☺": "หน้ายิ้ม",
+    "😍": "ตาหัวใจ",
+    "😗": "หน้าจุ๊บ",
+    "😘": "หน้าส่งจุ๊บ",
+    "😙": "หน้ายิ้มส่งจุ๊บ",
+    "😚": "หยีตาส่งจุ๊บ",
+    "🤩": "หน้าตาเป็นประกาย",
+    "🥰": "หน้ายิ้มพร้อมกับหัวใจหลายดวง",
+    "🥲": "ใบหน้ายิ้มทั้งน้ำตา",
+    "😋": "แลบลิ้นมุมปาก",
+    "😛": "แลบลิ้น",
+    "😜": "แลบลิ้นหน้าทะเล้น",
+    "😝": "แลบลิ้นตาหยี",
+    "🤑": "หน้าเห็นแก่เงิน",
+    "🤪": "หน้าเพี้ยน",
+    "🤔": "หน้าครุ่นคิด",
+    "🤗": "ยิ้มกอด",
+    "🤫": "หน้าบอกให้เงียบ",
+    "🤭": "เอามือปิดปาก",
+    "😏": "แสยะยิ้ม",
+    "😐": "หน้าเฉยๆ",
+    "😑": "หน้าเย็นชา",
+    "😒": "หน้าหน่าย",
+    "😬": "เบะปาก",
+    "😶": "หน้าไม่มีปาก",
+    "🙄": "กลอกตา",
+    "🤐": "รูดซิปปาก",
+    "🤥": "ขี้โกหก",
+    "🤨": "หน้าประหลาดใจกับยักคิ้ว",
+    "😌": "โล่งใจ",
+    "😔": "คิดไม่ตก",
+    "😪": "ง่วง",
+    "😴": "หลับ",
+    "🤤": "น้ำลายไหล",
+    "😵": "หน้ามึน",
+    "😷": "ผ้าคาดปาก",
+    "🤒": "อมปรอท",
+    "🤕": "หัวแตก",
+    "🤢": "หน้าเขียว",
+    "🤧": "จาม",
+    "🤮": "หน้าอาเจียน",
+    "🤯": "ช็อค",
+    "🥴": "หน้างงงวย",
+    "🥵": "ร้อนมาก",
+    "🥶": "หนาวสั่น",
+    "🤠": "คาวบอยสวมหมวก",
+    "🥳": "ไปปาร์ตี้",
+    "🥸": "ปลอมตัว",
+    "😎": "หน้ายิ้มใส่แว่น",
+    "🤓": "เด็กเนิร์ด",
+    "🧐": "หน้ากับแว่นเลนส์เดียว",
+    "☹": "หน้าบึ้ง",
+    "😓": "เหงื่อตก",
+    "😕": "หน้าสับสน",
+    "😖": "หน้ารำคาญ",
+    "😞": "หน้าผิดหวัง",
+    "😟": "หน้ากังวล",
+    "😢": "ร้องไห้",
+    "😣": "อดทน",
+    "😥": "โล่งอก",
+    "😦": "หน้าบึ้งอ้าปาก",
+    "😧": "หน้าเจ็บปวด",
+    "😨": "หวาดกลัว",
+    "😩": "หน้าอิดโรย",
+    "😫": "เหนื่อย",
+    "😭": "ร้องไห้โฮ",
+    "😮": "อ้าปาก",
+    "😯": "หน้าจุ๊ๆ",
+    "😰": "กังวลเหงื่อตก",
+    "😱": "กลัวกรีดร้อง",
+    "😲": "หน้าประหลาดใจ",
+    "😳": "อายหน้าแดง",
+    "🙁": "หน้าบึ้งเล็กน้อย",
+    "🥱": "หน้ากำลังหาว",
+    "🥺": "หน้าอ้อนวอน",
+    "☠": "กะโหลกไขว้",
+    "👿": "หน้าบึ้งมีเขา",
+    "💀": "หัวกระโหลก",
+    "😈": "ยิ้มมีเขา",
+    "😠": "หน้าโกรธ",
+    "😡": "โกรธมาก",
+    "😤": "หน้าข่มอารมณ์",
+    "🤬": "หน้ากำลังด่า",
+    "👹": "ยักษ์ญี่ปุ่น",
+    "👺": "ปีศาจญี่ปุ่น",
+    "👻": "ผี",
+    "👽": "เอเลี่ยน",
+    "👾": "สัตว์ประหลาดเอเลี่ยน",
+    "💩": "อุนจิ",
+    "🤖": "หุ่นยนต์",
+    "🤡": "หน้าตลก",
+    "😸": "แมวยิ้มอ้าปาก_ยิ้มออกตา",
+    "😹": "แมวร้องไห้ดีใจ",
+    "😺": "แมวยิ้มอ้าปาก",
+    "😻": "แมวยิ้มมีตารูปหัวใจ",
+    "😼": "แมวยิ้มเจ้าเล่ห์",
+    "😽": "แมวส่งจุ๊บ",
+    "😾": "แมวโกรธ",
+    "😿": "แมวร้องไห้",
+    "🙀": "แมวตกใจ",
+    "🙈": "ลิงปิดตา",
+    "🙉": "ลิงปิดหู",
+    "🙊": "ลิงปิดปาก",
+    "❣": "เครื่องหมายอัศเจรีย์รูปหัวใจ",
+    "❤": "หัวใจสีแดง",
+    "💋": "รอยจูบ",
+    "💌": "จดหมายรัก",
+    "💓": "หัวใจเต้น",
+    "💔": "อกหัก",
+    "💕": "ใจ_2_ดวง",
+    "💖": "หัวใจวิบวับ",
+    "💗": "ใจพองโต",
+    "💘": "ศรปักใจ",
+    "💙": "หัวใจสีน้ำเงิน",
+    "💚": "หัวใจสีเขียว",
+    "💛": "หัวใจสีเหลือง",
+    "💜": "หัวใจสีม่วง",
+    "💝": "หัวใจผูกริบบิ้น",
+    "💞": "หัวใจโคจร",
+    "💟": "หัวใจประดับ",
+    "💢": "สัญลักษณ์ความโกรธ",
+    "💣": "ระเบิด",
+    "💤": "หลับปุ๋ย",
+    "💥": "การปะทะ",
+    "💦": "เหงื่อหยด",
+    "💨": "วิ่งฉิว",
+    "💫": "มึนหัว",
+    "💬": "พูดไม่ออก",
+    "💭": "ลูกโป่งความคิด",
+    "💯": "คะแนนเต็ม",
+    "🕳": "หลุม",
+    "🖤": "ใจดำ",
+    "🗨": "ฟองคำพูด",
+    "🗯": "ฟองคำพูดรุนแรง",
+    "🤍": "หัวใจสีขาว",
+    "🤎": "หัวใจสีน้ำตาล",
+    "🧡": "หัวใจสีส้ม",
+    "✋": "ตั้งฝ่ามือ",
+    "👋": "โบกมือ",
+    "🖐": "ชูมือกางนิ้ว",
+    "🖖": "ชูนิ้วแบบวัลแคน",
+    "🤚": "ยกมือ",
+    "✌": "ชู_2_นิ้ว",
+    "👌": "ทำมือโอเค",
+    "🤌": "หุบนิ้ว",
+    "🤏": "ทำมือบีบนิ้วเข้าหากัน",
+    "🤘": "ชูนิ้วชาวร็อก",
+    "🤙": "มือโทร",
+    "🤞": "นิ้วไขว้",
+    "🤟": "ทำมือ_‘ฉันรักเธอ’",
+    "☝": "นิ้วชี้ขึ้น",
+    "👆": "หลังมือนิ้วชี้ขึ้น",
+    "👇": "นิ้วชี้ลง",
+    "👈": "นิ้วชี้ทางซ้าย",
+    "👉": "นิ้วชี้ทางขวา",
+    "🖕": "ชูนิ้วกลาง",
+    "✊": "กำมือ",
+    "👊": "กำปั้น",
+    "👍": "ชูนิ้วโป้งขึ้น",
+    "👎": "คว่ำนิ้วโป้งลง",
+    "🤛": "กำปั้นขวา",
+    "🤜": "กำปั้นซ้าย",
+    "👏": "ตบมือ",
+    "👐": "แบมือ",
+    "🙌": "ชู_2_มือ",
+    "🙏": "พนมมือ",
+    "🤝": "จับมือ",
+    "🤲": "แบสองมือ",
+    "✍": "เขียนหนังสือ",
+    "💅": "สีทาเล็บ",
+    "🤳": "เซลฟี่",
+    "👀": "ตา_2_ข้าง",
+    "👁": "ตาข้างเดียว",
+    "👂": "หู",
+    "👃": "จมูก",
+    "👄": "ปาก",
+    "👅": "ลิ้น",
+    "💪": "เบ่งกล้าม",
+    "🦴": "กระดูก",
+    "🦵": "ขา",
+    "🦶": "เท้า",
+    "🦷": "ฟัน",
+    "🦻": "หูใส่อุปกรณ์ช่วยฟัง",
+    "🦾": "แขนกล",
+    "🦿": "ขากล",
+    "🧠": "สมอง",
+    "🫀": "หัวใจ",
+    "🫁": "ปอด",
+    "👦": "เด็กชาย",
+    "👧": "เด็กหญิง",
+    "👨": "ผู้ชาย",
+    "👩": "ผู้หญิง",
+    "👱": "คนผมทอง",
+    "👴": "ชายแก่",
+    "👵": "หญิงแก่",
+    "👶": "ทารก",
+    "🧑": "คน",
+    "🧒": "เด็ก",
+    "🧓": "คนชรา",
+    "🧔": "ผู้ชายมีเครา",
+    "💁": "โต๊ะสอบถาม",
+    "🙅": "มือทำท่าไม่โอเค",
+    "🙆": "ทำท่าโอเค",
+    "🙇": "ท่าขอโทษ",
+    "🙋": "ยกมือขึ้น",
+    "🙍": "ขมวดคิ้ว",
+    "🙎": "ปากยื่น",
+    "🤦": "หมดกัน",
+    "🤷": "ยักไหล่",
+    "🧏": "คนหูหนวก",
+    "👮": "เจ้าหน้าที่ตำรวจ",
+    "👰": "เจ้าสาว",
+    "👲": "ชายจีน",
+    "👳": "ชายโพกหัว",
+    "👷": "คนงานก่อสร้าง",
+    "👸": "เจ้าหญิง",
+    "💂": "การ์ดคุ้มกัน",
+    "🕵": "นักสืบ",
+    "🤰": "คนท้อง",
+    "🤱": "ให้นม",
+    "🤴": "เจ้าชาย",
+    "🤵": "คนหล่อ",
+    "🥷": "นินจา",
+    "🧕": "ผู้หญิงโพกศีรษะ",
+    "🎅": "ซานต้า",
+    "👼": "นางฟ้าเด็ก",
+    "🤶": "นางซานต้า",
+    "🦸": "ซุปเปอร์ฮีโร่",
+    "🦹": "ยอดมนุษย์",
+    "🧙": "นักเวทย์",
+    "🧚": "นางฟ้า",
+    "🧛": "แวมไพร์",
+    "🧜": "ครึ่งคนครึ่งปลา",
+    "🧝": "เอลฟ์",
+    "🧞": "ยักษ์จีนี่",
+    "🧟": "ซอมบี้",
+    "🏃": "คนวิ่ง",
+    "👯": "คนในชุดหูกระต่าย",
+    "💃": "นักเต้น",
+    "💆": "นวดหน้า",
+    "💇": "ตัดผม",
+    "🕴": "คนใส่สูทลอยได้",
+    "🕺": "คนเต้น",
+    "🚶": "คนเดิน",
+    "🧍": "คนกำลังยืน",
+    "🧎": "คนกำลังคุกเข่า",
+    "🧖": "คนในห้องอบไอน้ำ",
+    "🧗": "นักไต่เขา",
+    "⛷": "นักสกี",
+    "⛹": "คนเล่นบอล",
+    "🏂": "นักสโนว์บอร์ด",
+    "🏄": "นักโต้คลื่น",
+    "🏇": "แข่งม้า",
+    "🏊": "นักว่ายน้ำ",
+    "🏋": "นักยกน้ำหนัก",
+    "🏌": "นักกอล์ฟ",
+    "🚣": "นักพายเรือ",
+    "🚴": "นักปั่นจักรยาน",
+    "🚵": "นักปั่นจักรยานเสือภูเขา",
+    "🤸": "คนตีลังกา",
+    "🤹": "คนเล่นจั๊กกลิ้ง",
+    "🤺": "นักฟันดาบ",
+    "🤼": "นักมวยปล้ำ",
+    "🤽": "นักโปโลน้ำ",
+    "🤾": "นักแฮนด์บอล",
+    "🛀": "คนนอนแช่น้ำในอ่าง",
+    "🛌": "คนนอนหลับ",
+    "🧘": "คนนั่งสมาธิ",
+    "👪": "ครอบครัว",
+    "👫": "ชาย-หญิงจับมือ",
+    "👬": "ชาย-ชายจับมือ",
+    "👭": "หญิง-หญิงจับมือ",
+    "💏": "จูบ",
+    "💑": "คู่รัก",
+    "👣": "รอยเท้า",
+    "👤": "เงาครึ่งตัวคนเดียว",
+    "👥": "เงาครึ่งตัว_2_คน",
+    "🗣": "เงาคนกำลังพูด",
+    "🫂": "คนกอดกัน",
+    "🏻": "โทนผิวสีขาว",
+    "🏼": "โทนผิวสีขาวเหลือง",
+    "🏽": "โทนผิวสีเหลือง",
+    "🏾": "โทนผิวสีแทน",
+    "🏿": "โทนผิวสีเข้ม",
+    "🦰": "ผมแดง",
+    "🦱": "ผมหยิก",
+    "🦲": "หัวล้าน",
+    "🦳": "ผมขาว",
+    "🐀": "หนูตัวใหญ่",
+    "🐁": "หนูตัวเล็ก",
+    "🐂": "วัวตัวผู้",
+    "🐃": "ควาย",
+    "🐄": "วัว",
+    "🐅": "เสือ",
+    "🐆": "เสือดาว",
+    "🐇": "กระต่าย",
+    "🐈": "แมว",
+    "🐎": "ม้า",
+    "🐏": "แกะตัวผู้",
+    "🐐": "แพะ",
+    "🐑": "แกะ",
+    "🐒": "ลิง",
+    "🐕": "สุนัข",
+    "🐖": "หมู",
+    "🐗": "หมูป่าตัวผู้",
+    "🐘": "ช้าง",
+    "🐨": "โคอาล่า",
+    "🐩": "พุดเดิ้ล",
+    "🐪": "อูฐโหนกเดียว",
+    "🐫": "อูฐสองโหนก",
+    "🐭": "หน้าหนู",
+    "🐮": "หน้าวัว",
+    "🐯": "หน้าเสือ",
+    "🐰": "หน้ากระต่าย",
+    "🐱": "หน้าแมว",
+    "🐴": "หน้าม้า",
+    "🐵": "หน้าลิง",
+    "🐶": "หน้าสุนัข",
+    "🐷": "หน้าหมู",
+    "🐹": "หนูแฮมสเตอร์",
+    "🐺": "หมาป่า",
+    "🐻": "หมี",
+    "🐼": "แพนด้า",
+    "🐽": "จมูกหมู",
+    "🐾": "รอยเท้าสัตว์",
+    "🐿": "ชิปมังก์",
+    "🦁": "สิงโต",
+    "🦄": "ยูนิคอร์น",
+    "🦇": "ค้างคาว",
+    "🦊": "จิ้งจอก",
+    "🦌": "กวาง",
+    "🦍": "กอริลล่า",
+    "🦏": "แรด",
+    "🦒": "ยีราฟ",
+    "🦓": "ม้าลาย",
+    "🦔": "เฮดจ์ฮ็อก",
+    "🦘": "จิงโจ้",
+    "🦙": "ลามะ",
+    "🦛": "ฮิปโปโปเตมัส",
+    "🦝": "แรคคูน",
+    "🦡": "แบดเจอร์",
+    "🦣": "ช้างแมมมอธ",
+    "🦥": "สลอธ",
+    "🦦": "ตัวนาก",
+    "🦧": "อุรังอุตัง",
+    "🦨": "สกังก์",
+    "🦫": "บีเวอร์",
+    "🦬": "ควายไบซัน",
+    "🦮": "สุนัขนำทาง",
+    "🐓": "ไก่ตัวผู้",
+    "🐔": "ไก่",
+    "🐣": "ลูกเจี๊ยบออกจากไข่",
+    "🐤": "ลูกเจี๊ยบ",
+    "🐥": "ลูกเจี๊ยบยืนหันหน้า",
+    "🐦": "นก",
+    "🐧": "เพนกวิน",
+    "🕊": "นกพิราบขาว",
+    "🦃": "ไก่งวง",
+    "🦅": "อินทรี",
+    "🦆": "เป็ด",
+    "🦉": "นกฮูก",
+    "🦚": "นกยูง",
+    "🦜": "นกแก้ว",
+    "🦢": "หงส์",
+    "🦤": "นกโดโด",
+    "🦩": "นกฟลามิงโก",
+    "🪶": "ขนนก",
+    "🐸": "กบ",
+    "🐉": "มังกร",
+    "🐊": "จระเข้",
+    "🐍": "งู",
+    "🐢": "เต่า",
+    "🐲": "หน้ามังกร",
+    "🦎": "จิ้งจก",
+    "🦕": "ไดโนเสาร์",
+    "🦖": "ทีเร็กซ์",
+    "🐋": "ปลาวาฬ",
+    "🐙": "ปลาหมึกยักษ์",
+    "🐚": "หอย",
+    "🐟": "ปลา",
+    "🐠": "ปลาเขตร้อน",
+    "🐡": "ปลาปักเป้า",
+    "🐬": "ปลาโลมา",
+    "🐳": "ปลาวาฬพ่นน้ำ",
+    "🦈": "ฉลาม",
+    "🦭": "แมวน้ำ",
+    "🐌": "หอยทาก",
+    "🐛": "แมลง",
+    "🐜": "มด",
+    "🐝": "ผึ้ง",
+    "🐞": "เต่าทอง",
+    "🕷": "แมงมุม",
+    "🕸": "ใยแมงมุม",
+    "🦂": "แมงป่อง",
+    "🦋": "ผีเสื้อ",
+    "🦗": "จิ้งหรีด",
+    "🦟": "ยุง",
+    "🦠": "จุลินทรีย์",
+    "🪰": "แมลงวัน",
+    "🪱": "หนอน",
+    "🪲": "ด้วง",
+    "🪳": "แมลงสาบ",
+    "🌷": "ทิวลิป",
+    "🌸": "ดอกซากุระ",
+    "🌹": "ดอกกุหลาบ",
+    "🌺": "ดอกชบา",
+    "🌻": "ดอกทานตะวัน",
+    "🌼": "ดอกไม้บาน",
+    "🏵": "ลายดอกกุหลาบ",
+    "💐": "ช่อดอกไม้",
+    "💮": "ตราดอกไม้",
+    "🥀": "ดอกไม้เหี่ยว",
+    "☘": "ใบโคลเวอร์",
+    "🌱": "ต้นอ่อน",
+    "🌲": "ต้นสน",
+    "🌳": "ต้นไม้ร่มรื่น",
+    "🌴": "ต้นมะพร้าว",
+    "🌵": "ตะบองเพชร",
+    "🌾": "รวงข้าว",
+    "🌿": "สมุนไพร",
+    "🍀": "ใบโคลเวอร์_4_แฉก",
+    "🍁": "ใบเมเปิ้ล",
+    "🍂": "ใบไม้ร่วง",
+    "🍃": "ใบไม้ปลิว",
+    "🪴": "ไม้กระถาง",
+    "🍅": "มะเขือเทศ",
+    "🍇": "องุ่น",
+    "🍈": "เมลอน",
+    "🍉": "แตงโม",
+    "🍊": "ส้ม",
+    "🍋": "เลมอน",
+    "🍌": "กล้วย",
+    "🍍": "สับปะรด",
+    "🍎": "แอปเปิ้ลแดง",
+    "🍏": "แอปเปิ้ลเขียว",
+    "🍐": "ลูกแพร์",
+    "🍑": "ลูกพีช",
+    "🍒": "เชอร์รี่",
+    "🍓": "สตรอว์เบอร์รี่",
+    "🥝": "กีวี",
+    "🥥": "มะพร้าว",
+    "🥭": "มะม่วง",
+    "🫐": "บลูเบอร์รี่",
+    "🫒": "มะกอก",
+    "🌰": "เกาลัด",
+    "🌶": "พริก",
+    "🌽": "ข้าวโพด",
+    "🍄": "เห็ด",
+    "🍆": "มะเขือยาว",
+    "🥑": "อาโวคาโด",
+    "🥒": "แตงกวา",
+    "🥔": "มันฝรั่ง",
+    "🥕": "แครอท",
+    "🥜": "ถั่ว",
+    "🥦": "บรอกโคลี",
+    "🥬": "ผักใบเขียว",
+    "🧄": "กระเทียม",
+    "🧅": "หอมหัวใหญ่",
+    "🫑": "พริกหยวก",
+    "🌭": "ฮอทด็อก",
+    "🌮": "ทาโก้",
+    "🌯": "เบอร์ริโต",
+    "🍔": "แฮมเบอร์เกอร์",
+    "🍕": "พิซซ่า_1_ชิ้น",
+    "🍖": "เนื้อ",
+    "🍗": "น่องไก่",
+    "🍞": "ขนมปัง",
+    "🍟": "เฟรนช์ฟราย",
+    "🍲": "สตูว์",
+    "🍳": "ทำอาหาร",
+    "🍿": "ป๊อปคอร์น",
+    "🥐": "ครัวซอง",
+    "🥓": "เบคอน",
+    "🥖": "ขนมปังฝรั่งเศส",
+    "🥗": "สลัด",
+    "🥘": "กระทะ",
+    "🥙": "เคบับ",
+    "🥚": "ไข่",
+    "🥞": "แพนเค้ก",
+    "🥣": "ชามพร้อมช้อน",
+    "🥨": "เพรตเซล",
+    "🥩": "เนื้อหั่นชิ้น",
+    "🥪": "แซนด์วิช",
+    "🥫": "อาหารกระป๋อง",
+    "🥯": "เบเกิล",
+    "🧀": "เนยแข็ง",
+    "🧂": "เกลือ",
+    "🧆": "ฟาลาเฟล",
+    "🧇": "วาฟเฟิล",
+    "🧈": "เนย",
+    "🫓": "แฟลตเบรด",
+    "🫔": "ทามาเล่",
+    "🫕": "ฟองดูว์",
+    "🍘": "ข้าวอบกรอบ",
+    "🍙": "ข้าวปั้น",
+    "🍚": "ข้าวสวย",
+    "🍛": "ข้าวแกงกะหรี่",
+    "🍜": "ราเมน",
+    "🍝": "สปาเก็ตตี้",
+    "🍠": "มันเผา",
+    "🍡": "ดังโงะ",
+    "🍢": "โอเด้ง",
+    "🍣": "ซูชิ",
+    "🍤": "กุ้งทอด",
+    "🍥": "ลูกชิ้นปลา",
+    "🍱": "กล่องเบนโตะ",
+    "🥟": "เกี๊ยว",
+    "🥠": "คุกกี้เสี่ยงทาย",
+    "🥡": "อาหารกล่องซื้อกลับบ้าน",
+    "🥮": "ขนมไหว้พระจันทร์",
+    "🦀": "ปู",
+    "🦐": "กุ้ง",
+    "🦑": "หมึก",
+    "🦞": "กุ้งมังกร",
+    "🦪": "หอยนางรม",
+    "🍦": "ซอฟต์ครีม",
+    "🍧": "น้ำแข็งไส",
+    "🍨": "ไอศกรีม",
+    "🍩": "โดนัท",
+    "🍪": "คุกกี้",
+    "🍫": "ช็อกโกแลต",
+    "🍬": "ลูกอม",
+    "🍭": "อมยิ้ม",
+    "🍮": "คัสตาร์ด",
+    "🍯": "โถน้ำผึ้ง",
+    "🍰": "เค้ก",
+    "🎂": "เค้กวันเกิด",
+    "🥧": "พาย",
+    "🧁": "คัพเค้ก",
+    "☕": "เครื่องดื่มร้อน",
+    "🍵": "ถ้วยชา",
+    "🍶": "สาเก",
+    "🍷": "ไวน์",
+    "🍸": "ค็อกเทล",
+    "🍹": "เครื่องดื่มผสมน้ำผลไม้",
+    "🍺": "เบียร์",
+    "🍻": "เหยือกเบียร์ชนกัน",
+    "🍼": "ขวดนม",
+    "🍾": "แชมเปญ",
+    "🥂": "ชนแก้ว",
+    "🥃": "แก้วเหล้า",
+    "🥛": "แก้วนม",
+    "🥤": "แก้วพร้อมหลอด",
+    "🧃": "เครื่องดื่มแบบกล่อง",
+    "🧉": "ชามาเต",
+    "🧊": "ก้อนน้ำแข็ง",
+    "🧋": "ชาไข่มุก",
+    "🫖": "กาน้ำชา",
+    "🍴": "ส้อม_มีด",
+    "🍽": "จานพร้อมส้อม_มีด",
+    "🏺": "โถโบราณ",
+    "🔪": "มีดทำครัว",
+    "🥄": "ช้อน",
+    "🥢": "ตะเกียบ",
+    "🌍": "ลูกโลกแสดงทวีปยุโรป_แอฟริกา",
+    "🌎": "ลูกโลกแสดงทวีปอเมริกา",
+    "🌏": "ลูกโลกแสดงทวีปเอเชีย_ออสเตรเลีย",
+    "🌐": "ลูกโลกแสดงเส้นเมริเดียน",
+    "🗺": "แผนที่โลก",
+    "🗾": "แผนที่ญี่ปุ่น",
+    "🧭": "เข็มทิศ",
+    "⛰": "ภูเขา",
+    "🌋": "ภูเขาไฟ",
+    "🏔": "ภูเขามีหิมะ",
+    "🏕": "ตั้งแคมป์",
+    "🏖": "ร่มชายหาด",
+    "🏜": "ทะเลทราย",
+    "🏝": "เกาะ",
+    "🏞": "อุทยาน",
+    "🗻": "ภูเขาไฟฟูจิ",
+    "🏗": "ก่อสร้างอาคาร",
+    "🏘": "บ้านหลายหลัง",
+    "🏚": "บ้านร้าง",
+    "🏛": "อาคารสไตล์คลาสสิก",
+    "🏟": "สนามกีฬา",
+    "🏠": "บ้านเดี่ยว",
+    "🏡": "บ้านพร้อมสวน",
+    "🏢": "ตึกสำนักงาน",
+    "🏣": "ไปรษณีย์ญี่ปุ่น",
+    "🏤": "ไปรษณีย์ยุโรป",
+    "🏥": "โรงพยาบาล",
+    "🏦": "ธนาคาร",
+    "🏨": "โรงแรม",
+    "🏩": "ม่านรูด",
+    "🏪": "ร้านสะดวกซื้อ",
+    "🏫": "โรงเรียน",
+    "🏬": "ห้างสรรพสินค้า",
+    "🏭": "โรงงาน",
+    "🏯": "ปราสาทญี่ปุ่น",
+    "🏰": "ปราสาทยุโรป",
+    "💒": "งานแต่งงาน",
+    "🗼": "โตเกียวทาวเวอร์",
+    "🗽": "เทพีเสรีภาพ",
+    "🛖": "กระท่อม",
+    "🧱": "ก้อนอิฐ",
+    "🪨": "หิน",
+    "🪵": "ไม้",
+    "⛩": "ศาลเจ้าชินโต",
+    "⛪": "โบสถ์",
+    "🕋": "วิหารกะอ์บะฮ์",
+    "🕌": "มัสยิด",
+    "🕍": "โบสถ์ยิว",
+    "🛕": "วัดฮินดู",
+    "♨": "น้ำพุร้อน",
+    "⛲": "น้ำพุ",
+    "⛺": "เต็นท์",
+    "🌁": "หมอกลง",
+    "🌃": "ดาว",
+    "🌄": "ดวงอาทิตย์โผล่พ้นเขา",
+    "🌅": "ดวงอาทิตย์ขึ้น",
+    "🌆": "เมืองยามโพล้เพล้",
+    "🌇": "ดวงอาทิตย์ตก",
+    "🌉": "สะพานกลางคืน",
+    "🎠": "ม้าหมุน",
+    "🎡": "ชิงช้าสวรรค์",
+    "🎢": "รถไฟเหาะ",
+    "🎪": "ละครสัตว์",
+    "🏙": "หมู่ตึกสูง",
+    "💈": "ร้านตัดผม",
+    "⛽": "ปั๊มน้ำมัน",
+    "🏍": "มอเตอร์ไซค์",
+    "🏎": "รถแข่ง",
+    "🚂": "หัวรถจักรไอน้ำ",
+    "🚃": "ตู้รถไฟ",
+    "🚄": "ชินคันเซ็น",
+    "🚅": "รถไฟความเร็วสูง",
+    "🚆": "รถไฟ",
+    "🚇": "รถไฟใต้ดิน",
+    "🚈": "รถไฟรางเบา",
+    "🚉": "สถานีรถไฟ",
+    "🚊": "รถรางบนราง",
+    "🚋": "ตู้รถราง",
+    "🚌": "รถบัส",
+    "🚍": "รถบัสกำลังมา",
+    "🚎": "รถราง",
+    "🚏": "ป้ายรถบัส",
+    "🚐": "มินิบัส",
+    "🚑": "รถพยาบาล",
+    "🚒": "รถดับเพลิง",
+    "🚓": "รถตำรวจ",
+    "🚔": "รถตำรวจกำลังมา",
+    "🚕": "แท็กซี่",
+    "🚖": "แท็กซี่กำลังมา",
+    "🚗": "รถ",
+    "🚘": "รถกำลังมา",
+    "🚙": "รถบ้าน",
+    "🚚": "รถขนส่ง",
+    "🚛": "รถบรรทุก",
+    "🚜": "แทร็กเตอร์",
+    "🚝": "รถไฟรางเดี่ยว",
+    "🚞": "รางรถไฟภูเขา",
+    "🚥": "ไฟจราจรแนวนอน",
+    "🚦": "ไฟจราจรแนวตั้ง",
+    "🚧": "ป้ายไซต์ก่อสร้าง",
+    "🚨": "สัญญาณไฟตำรวจ",
+    "🚲": "จักรยาน",
+    "🛑": "เครื่องหมายหยุด",
+    "🛢": "ถังน้ำมัน",
+    "🛣": "ทางด่วน",
+    "🛤": "รางรถไฟ",
+    "🛴": "สกู๊ตเตอร์",
+    "🛵": "รถสกู๊ตเตอร์",
+    "🛹": "สเก็ตบอร์ด",
+    "🛺": "รถสามล้อ",
+    "🛻": "รถกระบะ",
+    "🛼": "รองเท้าสเก็ต",
+    "🦼": "วีลแชร์ไฟฟ้า",
+    "🦽": "วีลแชร์ธรรมดา",
+    "⚓": "สมอเรือ",
+    "⛴": "เรือเฟอร์รี",
+    "⛵": "เรือใบ",
+    "🚢": "เรือ",
+    "🚤": "เรือด่วน",
+    "🛥": "เรือยนต์",
+    "🛳": "เรือโดยสาร",
+    "🛶": "แคนู",
+    "✈": "เครื่องบิน",
+    "💺": "ที่นั่ง",
+    "🚀": "จรวด",
+    "🚁": "เฮลิคอปเตอร์",
+    "🚟": "รถไฟสะพานแขวน",
+    "🚠": "เคเบิลคาร์",
+    "🚡": "รถกระเช้าลอยฟ้า",
+    "🛩": "เครื่องบินเล็ก",
+    "🛫": "เครื่องบินขึ้น",
+    "🛬": "เครื่องบินลง",
+    "🛰": "ดาวเทียม",
+    "🛸": "จานบิน",
+    "🪂": "ร่มชูชีพ",
+    "🛎": "กระดิ่งโรงแรม",
+    "🧳": "กระเป๋าเดินทาง",
+    "⌚": "นาฬิกาข้อมือ",
+    "⌛": "นาฬิกาทราย",
+    "⏰": "นาฬิกาปลุก",
+    "⏱": "นาฬิกาจับเวลา",
+    "⏲": "นาฬิกานับถอยหลัง",
+    "⏳": "นาฬิกาทรายจับเวลา",
+    "🕐": "หนึ่งนาฬิกา",
+    "🕑": "สองนาฬิกา",
+    "🕒": "สามนาฬิกา",
+    "🕓": "สี่นาฬิกา",
+    "🕔": "ห้านาฬิกา",
+    "🕕": "หกนาฬิกา",
+    "🕖": "เจ็ดนาฬิกา",
+    "🕗": "แปดนาฬิกา",
+    "🕘": "เก้านาฬิกา",
+    "🕙": "สิบนาฬิกา",
+    "🕚": "สิบเอ็ดนาฬิกา",
+    "🕛": "สิบสองนาฬิกา",
+    "🕜": "หนึ่งนาฬิกาครึ่ง",
+    "🕝": "สองนาฬิกาครึ่ง",
+    "🕞": "สามนาฬิกาครึ่ง",
+    "🕟": "สี่นาฬิกาครึ่ง",
+    "🕠": "ห้านาฬิกาครึ่ง",
+    "🕡": "หกนาฬิกาครึ่ง",
+    "🕢": "เจ็ดนาฬิกาครึ่ง",
+    "🕣": "แปดนาฬิกาครึ่ง",
+    "🕤": "เก้านาฬิกาครึ่ง",
+    "🕥": "สิบนาฬิกาครึ่ง",
+    "🕦": "สิบเอ็ดนาฬิกาครึ่ง",
+    "🕧": "สิบสองนาฬิกาครึ่ง",
+    "🕰": "นาฬิกาบนหิ้ง",
+    "☀": "พระอาทิตย์",
+    "☁": "เมฆ",
+    "☂": "ร่ม",
+    "☃": "สโนว์แมน_หิมะ",
+    "☄": "ดาวหาง",
+    "☔": "ร่ม_หยดน้ำฝน",
+    "⚡": "ไฟฟ้าแรงสูง",
+    "⛄": "สโนว์แมน",
+    "⛅": "ดวงอาทิตย์หลังเมฆ",
+    "⛈": "ฝนฟ้าคะนอง",
+    "⛱": "ร่มปักดิน",
+    "❄": "เกล็ดหิมะ",
+    "⭐": "ดาวสีขาวขนาดกลาง",
+    "🌀": "ไซโคลน",
+    "🌂": "ร่มหุบ",
+    "🌈": "รุ้ง",
+    "🌊": "คลื่น",
+    "🌌": "ทางช้างเผือก",
+    "🌑": "จันทร์ดับ",
+    "🌒": "พระจันทร์เสี้ยวข้างขึ้น",
+    "🌓": "พระจันทร์ครึ่งซีกขวา",
+    "🌔": "ข้างขึ้น",
+    "🌕": "พระจันทร์เต็มดวง",
+    "🌖": "ข้างแรม",
+    "🌗": "พระจันทร์ครึ่งซีกซ้าย",
+    "🌘": "พระจันทร์เสี้ยวข้างแรม",
+    "🌙": "พระจันทร์เสี้ยว",
+    "🌚": "หน้าพระจันทร์ดับ",
+    "🌛": "หน้าพระจันทร์เสี้ยวขวา",
+    "🌜": "หน้าพระจันทร์เสี้ยวซ้าย",
+    "🌝": "หน้าพระจันทร์เต็มดวง",
+    "🌞": "หน้าพระอาทิตย์",
+    "🌟": "ดาวส่องแสง",
+    "🌠": "ดาวตก",
+    "🌡": "เครื่องวัดอุณหภูมิ",
+    "🌤": "เมฆน้อยบดบังพระอาทิตย์",
+    "🌥": "เมฆก้อนใหญ่บังพระอาทิตย์",
+    "🌦": "เมฆฝนบดบังพระอาทิตย์",
+    "🌧": "เมฆฝน",
+    "🌨": "เมฆ_หิมะ",
+    "🌩": "เมฆ_ฟ้าแลบ",
+    "🌪": "พายุทอร์นาโด",
+    "🌫": "หมอก",
+    "🌬": "พ่นลม",
+    "💧": "หยดน้ำ",
+    "🔥": "ไฟ",
+    "🪐": "ดาวเคราะห์ที่มีวงแหวน",
+    "✨": "ประกายวิบวับ",
+    "🎀": "ริบบิ้น",
+    "🎁": "ของขวัญ",
+    "🎃": "ฟักทองฮาโลวีน",
+    "🎄": "ต้นคริสต์มาส",
+    "🎆": "พลุ",
+    "🎇": "ดอกไม้ไฟ",
+    "🎈": "ลูกโป่ง",
+    "🎉": "ปาร์ตี้",
+    "🎊": "ลูกบอลใส่เศษกระดาษงานปาร์ตี้",
+    "🎋": "ต้นไม้ประดับคำอวยพร",
+    "🎍": "ต้นสนประดับ",
+    "🎎": "ตุ๊กตาญี่ปุ่น",
+    "🎏": "ธงปลาคาร์พ",
+    "🎐": "โมบายล์กระดิ่ง",
+    "🎑": "ไหว้พระจันทร์",
+    "🎗": "ริบบิ้นรำลึก",
+    "🎟": "ตั๋วเข้าชม",
+    "🎫": "ตั๋ว",
+    "🧧": "อั่งเปา",
+    "🧨": "ประทัด",
+    "🎖": "เหรียญกล้าหาญ",
+    "🏅": "เหรียญรางวัล",
+    "🏆": "ถ้วยรางวัล",
+    "🥇": "เหรียญทอง",
+    "🥈": "เหรียญเงิน",
+    "🥉": "เหรียญทองแดง",
+    "⚽": "ลูกฟุตบอล",
+    "⚾": "เบสบอล",
+    "⛳": "ธงในหลุม",
+    "⛸": "สเก็ตน้ำแข็ง",
+    "🎣": "ตกปลา",
+    "🎳": "โบว์ลิ่ง",
+    "🎽": "เสื้อวิ่ง",
+    "🎾": "เทนนิส",
+    "🎿": "สกี",
+    "🏀": "บาสเกตบอล",
+    "🏈": "อเมริกันฟุตบอล",
+    "🏉": "รักบี้",
+    "🏏": "คริกเก็ต",
+    "🏐": "วอลเลย์บอล",
+    "🏑": "ฮอกกี้",
+    "🏒": "ไม้ฮอกกี้",
+    "🏓": "ปิงปอง",
+    "🏸": "แบดมินตัน",
+    "🛷": "เลื่อนหิมะ",
+    "🤿": "หน้ากากดำน้ำ",
+    "🥅": "โกล",
+    "🥊": "นวม",
+    "🥋": "ชุดยูโด",
+    "🥌": "ลูกกลิ้งหิน",
+    "🥍": "ลาครอส",
+    "🥎": "ซอฟต์บอล",
+    "🥏": "จานร่อน",
+    "♟": "หมากรุก",
+    "♠": "โพดำ",
+    "♣": "ดอกจิก",
+    "♥": "โพแดง",
+    "♦": "ข้าวหลามตัด",
+    "🀄": "ไพ่นกกระจอกมังกรแดง",
+    "🃏": "ไพ่โจ๊กเกอร์",
+    "🎮": "วิดีโอเกม",
+    "🎯": "กลางเป้า",
+    "🎰": "สล็อตแมชชีน",
+    "🎱": "บิลเลียด",
+    "🎲": "ลูกเต๋า",
+    "🎴": "ไพ่ดอกไม้",
+    "🔮": "ลูกแก้ววิเศษ",
+    "🕹": "จอยสติ๊ก",
+    "🧩": "จิ๊กซอว์",
+    "🧸": "ตุ๊กตาหมี",
+    "🧿": "เครื่องราง",
+    "🪀": "โยโย่",
+    "🪁": "ว่าว",
+    "🪄": "ไม้กายสิทธิ์",
+    "🪅": "ปิญญาตา",
+    "🪆": "ตุ๊กตาแม่ลูกดก",
+    "🎨": "จานสีวาดรูป",
+    "🎭": "หน้ากาก",
+    "🖼": "รูปใส่กรอบ",
+    "🧵": "ด้าย",
+    "🧶": "ไหมพรม",
+    "🪡": "เข็มเย็บผ้า",
+    "🪢": "เงื่อน",
+    "⛑": "หมวกนิรภัยมีกากบาทขาว",
+    "🎒": "เป้นักเรียน",
+    "🎓": "หมวกรับปริญญา",
+    "🎩": "หมวกสูง",
+    "👑": "มงกุฎ",
+    "👒": "หมวกผู้หญิง",
+    "👓": "แว่นตา",
+    "👔": "เนคไท",
+    "👕": "เสื้อยืด",
+    "👖": "ยีนส์",
+    "👗": "ชุดกระโปรง",
+    "👘": "กิโมโน",
+    "👙": "บิกินี",
+    "👚": "เสื้อผู้หญิง",
+    "👛": "กระเป๋าใส่เงิน",
+    "👜": "กระเป๋าถือ",
+    "👝": "กระเป๋าใบเล็ก",
+    "👞": "รองเท้าชาย",
+    "👟": "รองเท้ากีฬา",
+    "👠": "รองเท้าส้นสูง",
+    "👡": "รองเท้าแตะผู้หญิง",
+    "👢": "รองเท้าบู๊ตผู้หญิง",
+    "💄": "ลิปสติก",
+    "💍": "แหวน",
+    "💎": "อัญมณี",
+    "📿": "ลูกประคำ",
+    "🕶": "แว่นกันแดด",
+    "🛍": "ถุงช็อปปิ้ง",
+    "🥻": "ชุดส่าหรี",
+    "🥼": "เสื้อกาวน์",
+    "🥽": "แว่นตากันลม",
+    "🥾": "รองเท้าปีนเขา",
+    "🥿": "รองเท้าส้นเตี้ย",
+    "🦺": "เสื้อนิรภัย",
+    "🧢": "หมวกแก๊ป",
+    "🧣": "ผ้าพันคอ",
+    "🧤": "ถุงมือ",
+    "🧥": "เสื้อโค้ต",
+    "🧦": "ถุงเท้า",
+    "🩰": "รองเท้าบัลเล่ต์",
+    "🩱": "ชุดว่ายน้ำวันพีซ",
+    "🩲": "กางเกงชั้นในชาย",
+    "🩳": "กางเกงขาสั้น",
+    "🩴": "รองเท้าลำลอง",
+    "🪖": "หมวกทหาร",
+    "📢": "เครื่องขยายเสียง",
+    "📣": "โทรโข่ง",
+    "📯": "แตรส่งสาร",
+    "🔇": "ไม่ใช้เสียง",
+    "🔈": "ลำโพงเสียงเบา",
+    "🔉": "ลำโพงเสียงปานกลาง",
+    "🔊": "ลำโพงเสียงดัง",
+    "🔔": "กระดิ่ง",
+    "🔕": "ไม่มีกระดิ่ง",
+    "🎙": "ไมค์สตูดิโอ",
+    "🎚": "ที่ปรับระดับเสียง",
+    "🎛": "ปุ่มควบคุมเสียง",
+    "🎤": "ไมโครโฟน",
+    "🎧": "หูฟัง",
+    "🎵": "โน้ตดนตรี",
+    "🎶": "โน้ตดนตรีหลายตัว",
+    "🎼": "บรรทัดห้าเส้น",
+    "📻": "วิทยุ",
+    "🎷": "แซ็กโซโฟน",
+    "🎸": "กีต้าร์",
+    "🎹": "เปียโน",
+    "🎺": "ทรัมเป็ต",
+    "🎻": "ไวโอลิน",
+    "🥁": "กลอง",
+    "🪕": "แบนโจ",
+    "🪗": "แอคคอร์เดียน",
+    "🪘": "กลองยาว",
+    "☎": "โทรศัพท์",
+    "📞": "หูโทรศัพท์",
+    "📟": "เพจเจอร์",
+    "📠": "แฟกซ์",
+    "📱": "โทรศัพท์มือถือ",
+    "📲": "โทรศัพท์มือถือพร้อมลูกศรชี้",
+    "⌨": "แป้นพิมพ์",
+    "💻": "แล็ปท็อป",
+    "💽": "แผ่นดิสก์",
+    "💾": "ฟลอปปี้ดิสก์",
+    "💿": "บลูเรย์",
+    "📀": "ดีวีดี",
+    "🔋": "แบตเตอรี่",
+    "🔌": "ปลั๊กไฟ",
+    "🖥": "คอมพิวเตอร์เดสก์ท็อป",
+    "🖨": "เครื่องพิมพ์",
+    "🖱": "เมาส์",
+    "🖲": "แทร็กบอล",
+    "🧮": "ลูกคิด",
+    "🎞": "เฟรมภาพยนตร์",
+    "🎥": "กล้องถ่ายภาพยนตร์",
+    "🎬": "สเลท",
+    "🏮": "โคมไฟแดง",
+    "💡": "หลอดไฟ",
+    "📷": "กล้อง",
+    "📸": "กล้องเปิดแฟลช",
+    "📹": "กล้องวิดีโอ",
+    "📺": "ทีวี",
+    "📼": "วิดีโอเทป",
+    "📽": "เครื่องฉายหนัง",
+    "🔍": "แว่นขยายเอียงซ้าย",
+    "🔎": "แว่นขยายเอียงขวา",
+    "🔦": "ไฟฉาย",
+    "🕯": "เทียน",
+    "🪔": "ตะเกียงดิยา",
+    "🏷": "ป้าย",
+    "📃": "เอกสารม้วนปลาย",
+    "📄": "เอกสาร",
+    "📑": "แถบคั่นหน้า",
+    "📒": "สมุดโน้ตเจาะรูข้าง",
+    "📓": "สมุดโน้ต",
+    "📔": "สมุดโน้ตมีลาย",
+    "📕": "หนังสือปิด",
+    "📖": "หนังสือเปิด",
+    "📗": "หนังสือสีเขียว",
+    "📘": "หนังสือสีน้ำเงิน",
+    "📙": "หนังสือสีส้ม",
+    "📚": "หนังสือ",
+    "📜": "ม้วนกระดาษ",
+    "📰": "หนังสือพิมพ์",
+    "🔖": "ที่คั่นหนังสือ",
+    "🗞": "ม้วนหนังสือพิมพ์",
+    "💰": "ถุงเงิน",
+    "💳": "บัตรเครดิต",
+    "💴": "ธนบัตรเยน",
+    "💵": "ธนบัตรดอลลาร์",
+    "💶": "ธนบัตรยูโร",
+    "💷": "ธนบัตรปอนด์",
+    "💸": "เงินบิน",
+    "💹": "ตลาดขึ้น",
+    "🧾": "ใบเสร็จ",
+    "🪙": "เหรียญ",
+    "✉": "ซองจดหมาย",
+    "📤": "กล่องขาออก",
+    "📥": "กล่องขาเข้า",
+    "📦": "พัสดุ",
+    "📧": "อีเมล",
+    "📨": "จดหมายเข้า",
+    "📩": "จดหมายออก",
+    "📪": "กล่องจดหมายปิดคว่ำธง",
+    "📫": "กล่องจดหมายปิดยกธง",
+    "📬": "กล่องจดหมายมีจดหมาย",
+    "📭": "กล่องจดหมายว่าง",
+    "📮": "ตู้ไปรษณีย์",
+    "🗳": "หีบบัตรลงคะแนน",
+    "✏": "ดินสอ",
+    "✒": "ปลายปากกาสีดำ",
+    "📝": "จดบันทึก",
+    "🖊": "ปากกา",
+    "🖋": "ปากกาหมึกซึม",
+    "🖌": "แปรงทาสี",
+    "🖍": "ดินสอสี",
+    "✂": "กรรไกร",
+    "💼": "กระเป๋าเอกสาร",
+    "📁": "แฟ้มเอกสาร",
+    "📂": "เปิดเอกสาร",
+    "📅": "ปฏิทิน",
+    "📆": "ปฏิทินแบบฉีกออก",
+    "📇": "ที่ใส่บัตร",
+    "📈": "แนวโน้มขึ้น",
+    "📉": "แนวโน้มลง",
+    "📊": "กราฟแท่ง",
+    "📋": "คลิปบอร์ด",
+    "📌": "หมุดปัก",
+    "📍": "หมุดหัวกลม",
+    "📎": "คลิปหนีบกระดาษ",
+    "📏": "ไม้บรรทัดตรง",
+    "📐": "ไม้บรรทัดสามเหลี่ยม",
+    "🖇": "คลิปหนีบกระดาษคู่",
+    "🗂": "ที่คั่นบัตรรายการ",
+    "🗃": "กล่องใส่แฟ้ม",
+    "🗄": "ตู้เอกสาร",
+    "🗑": "ตะกร้าขยะ",
+    "🗒": "สมุดโน้ตสันห่วง",
+    "🗓": "ปฏิทินสันห่วง",
+    "🔏": "ล็อคด้วยปากกา",
+    "🔐": "ล็อคด้วยกุญแจ",
+    "🔑": "กุญแจ",
+    "🔒": "ล็อคปิด",
+    "🔓": "ล็อคเปิด",
+    "🗝": "กุญแจเก่า",
+    "⚒": "ค้อน_จอบ",
+    "⚔": "ดาบไขว้",
+    "⚖": "ตราชั่ง",
+    "⚙": "เฟือง",
+    "⛏": "จอบ",
+    "⛓": "โซ่",
+    "🏹": "ธนู_ลูกศร",
+    "🔗": "สัญลักษณ์การลิงก์",
+    "🔧": "ประแจ",
+    "🔨": "ค้อน",
+    "🔩": "สลัก_น็อต",
+    "🔫": "ปืนฉีดน้ำ",
+    "🗜": "เครื่องบีบอัด",
+    "🗡": "ดาบสั้น",
+    "🛠": "ค้อน_ประแจ",
+    "🛡": "โล่",
+    "🦯": "ไม้เท้านำทาง",
+    "🧰": "กล่องเครื่องมือ",
+    "🧲": "แม่เหล็ก",
+    "🪃": "บูมเมอแรง",
+    "🪓": "ขวาน",
+    "🪚": "เลื่อย",
+    "🪛": "ไขควง",
+    "🪜": "บันไดปีน",
+    "🪝": "ตะขอ",
+    "⚗": "อุปกรณ์กลั่น",
+    "📡": "จานดาวเทียม",
+    "🔬": "กล้องจุลทรรศน์",
+    "🔭": "กล้องโทรทรรศน์",
+    "🧪": "หลอดทดลอง",
+    "🧫": "จานเพาะเชื้อ",
+    "🧬": "ดีเอ็นเอ",
+    "💉": "กระบอกฉีดยา",
+    "💊": "ยาเม็ด",
+    "🩸": "หยดเลือด",
+    "🩹": "พลาสเตอร์ปิดแผล",
+    "🩺": "เครื่องฟังตรวจ",
+    "🚪": "ประตู",
+    "🚽": "โถส้วม",
+    "🚿": "ฝักบัว",
+    "🛁": "อ่างอาบน้ำ",
+    "🛋": "โซฟากับโคมไฟ",
+    "🛏": "เตียง",
+    "🛒": "รถเข็น",
+    "🛗": "ลิฟต์",
+    "🧯": "ที่ดับเพลิง",
+    "🧴": "ขวดโลชั่น",
+    "🧷": "เข็มกลัดซ่อนปลาย",
+    "🧹": "ไม้กวาด",
+    "🧺": "ตะกร้า",
+    "🧻": "กระดาษชำระ",
+    "🧼": "สบู่",
+    "🧽": "ฟองน้ำ",
+    "🪑": "เก้าอี้",
+    "🪒": "ใบมีดโกน",
+    "🪞": "กระจก",
+    "🪟": "หน้าต่าง",
+    "🪠": "ที่ปั๊มชักโครก",
+    "🪣": "ถัง",
+    "🪤": "กับดักหนู",
+    "🪥": "แปรงสีฟัน",
+    "⚰": "โลงศพ",
+    "⚱": "โกศกระดูก",
+    "🗿": "รูปปั้นโมไอ",
+    "🚬": "ป้ายสูบบุหรี่",
+    "🪦": "แผ่นหินจารึก",
+    "🪧": "ป้ายประกาศ",
+    "♿": "รถเข็นผู้ป่วย",
+    "🏧": "เอทีเอ็ม",
+    "🚮": "ทิ้งขยะให้ลงถัง",
+    "🚰": "น้ำดื่ม",
+    "🚹": "ห้องน้ำชาย",
+    "🚺": "ห้องน้ำหญิง",
+    "🚻": "ห้องน้ำ",
+    "🚼": "ป้ายทารก",
+    "🚾": "ป้ายห้องน้ำ",
+    "🛂": "ตรวจพาสปอร์ต",
+    "🛃": "ศุลกากร",
+    "🛄": "รับสัมภาระ",
+    "🛅": "บริการฝากกระเป๋า",
+    "☢": "กัมมันตรังสี",
+    "☣": "เชื้อโรคอันตราย",
+    "⚠": "ป้ายระวัง",
+    "⛔": "ป้ายห้ามเข้า",
+    "📵": "ห้ามใช้โทรศัพท์",
+    "🔞": "ห้ามอายุต่ำกว่า_18_ปี",
+    "🚫": "ป้ายหวงห้าม",
+    "🚭": "ป้ายห้ามสูบบุหรี่",
+    "🚯": "ห้ามทิ้งขยะ",
+    "🚱": "ห้ามดื่มน้ำ",
+    "🚳": "ห้ามจักรยาน",
+    "🚷": "ห้ามคนเดินเท้า",
+    "🚸": "เด็กๆ_ข้ามถนน",
+    "↔": "ลูกศรชี้ซ้ายขวา",
+    "↕": "ลูกศรชี้ขึ้นลง",
+    "↖": "ลูกศรชี้มุมซ้ายบน",
+    "↗": "ลูกศรชี้มุมขวาบน",
+    "↘": "ลูกศรชี้มุมขวาล่าง",
+    "↙": "ลูกศรชี้มุุมซ้ายล่าง",
+    "↩": "ลูกศรวนซ้าย",
+    "↪": "ลูกศรวนขวา",
+    "➡": "ลูกศรชี้ไปทางขวา",
+    "⤴": "ลูกศรโค้งขึ้น",
+    "⤵": "ลูกศรโค้งลง",
+    "⬅": "ลูกศรชี้ไปทางซ้าย",
+    "⬆": "ลูกศรชี้ขึ้น",
+    "⬇": "ลูกศรชี้ลง",
+    "🔃": "สัญลักษณ์โหลดซ้ำ",
+    "🔄": "ลูกศรทวนเข็มนาฬิกา",
+    "🔙": "กลับ",
+    "🔚": "สิ้นสุด",
+    "🔛": "เปิด",
+    "🔜": "สัญลักษณ์เร็วๆ_นี้",
+    "🔝": "สัญลักษณ์บน",
+    "☦": "ไม้กางเขนออร์โธดอกซ์",
+    "☪": "พระจันทร์เสี้ยว_ดาว",
+    "☮": "เครื่องหมายสันติภาพ",
+    "☯": "หยินหยาง",
+    "☸": "ธรรมจักร",
+    "⚛": "อะตอม",
+    "✝": "ไม้กางเขนละติน",
+    "✡": "สตาร์ออฟเดวิด",
+    "🔯": "ดาว_6_แฉก",
+    "🕉": "เครื่องหมายโอม",
+    "🕎": "เชิงเทียน_7_กิ่ง",
+    "🛐": "ที่บูชา",
+    "♈": "ราศีเมษ",
+    "♉": "ราศีพฤษภ",
+    "♊": "ราศีเมถุน",
+    "♋": "ราศีกรกฎ",
+    "♌": "ราศีสิงห์",
+    "♍": "ราศีกันย์",
+    "♎": "ราศีตุลย์",
+    "♏": "ราศีพิจิก",
+    "♐": "ราศีธนู",
+    "♑": "ราศีมังกร",
+    "♒": "ราศีกุมภ์",
+    "♓": "ราศีมีน",
+    "⛎": "กลุ่มดาวคนแบกงู",
+    "⏏": "ปุ่มดีดออก",
+    "⏩": "เร่งไปข้างหน้า",
+    "⏪": "ถอยกลับ",
+    "⏫": "ลูกศรขึ้น",
+    "⏬": "ลูกศรลง",
+    "⏭": "เล่นแทร็กถัดไป",
+    "⏮": "ปุ่มแทร็กก่อนหน้า",
+    "⏯": "เล่นหรือหยุดชั่วคราว",
+    "⏸": "ปุ่มหยุุดชั่วคราว",
+    "⏹": "ปุ่มหยุด",
+    "⏺": "ปุ่มอัด",
+    "▶": "เล่น",
+    "◀": "ย้อนกลับ",
+    "🎦": "โรงภาพยนตร์",
+    "📳": "โหมดสั่น",
+    "📴": "ปิดมือถือ",
+    "📶": "สัญญาณมือถือ",
+    "🔀": "ลูกศรไขว้",
+    "🔁": "เล่นซ้ำ",
+    "🔂": "เล่นซ้ำเพลงเดียว",
+    "🔅": "แสงสว่างน้อย",
+    "🔆": "แสงสว่างมาก",
+    "🔼": "ปุ่มสามเหลี่ยมขึ้น",
+    "🔽": "ปุ่มสามเหลี่ยมลง",
+    "♀": "สัญลักษณ์เพศหญิง",
+    "♂": "สัญลักษณ์เพศชาย",
+    "⚧": "สัญลักษณ์คนข้ามเพศ",
+    "©": "ลิขสิทธิ์",
+    "®": "จดทะเบียน",
+    "‼": "เครื่องหมายอัศเจรีย์คู่",
+    "⁉": "เครื่องหมายอัศเจรีย์_คำถาม",
+    "™": "เครื่องหมายการค้า",
+    "☑": "กล่องกาเครื่องหมายมีเครื่องหมายถูก",
+    "♻": "สัญลักษณ์รีไซเคิล",
+    "♾": "ไม่มีที่สิ้นสุด",
+    "⚕": "เครื่องหมายการแพทย์",
+    "⚜": "สัญลักษณ์ดอกลิลลี่",
+    "✅": "ปุ่มเครื่องหมายถูก",
+    "✔": "เครื่องหมายถูก",
+    "✖": "คูณ",
+    "✳": "ดอกจัน_8_ซี่",
+    "✴": "ดาว_8_แฉก",
+    "❇": "เปล่งประกาย",
+    "❌": "เครื่องหมายกากบาท",
+    "❎": "ปุ่มเครื่องหมายกากบาท",
+    "❓": "เครื่องหมายคำถาม",
+    "❔": "เครื่องหมายคำถามสีขาว",
+    "❕": "เครื่องหมายอัศเจรีย์สีขาว",
+    "❗": "เครื่องหมายอัศเจรีย์สีแดง",
+    "➕": "บวก",
+    "➖": "ลบ",
+    "➗": "หาร",
+    "➰": "ห่วง",
+    "➿": "ห่วง_2_รู",
+    "⭕": "วงกลมกลวงสีแดง",
+    "〰": "เส้นคลื่น",
+    "〽": "เครื่องหมายเปลี่ยนท่อน",
+    "💱": "การแลกเปลี่ยนเงิน",
+    "💲": "สัญลักษณ์ดอลลาร์",
+    "📛": "ป้ายชื่อ",
+    "🔰": "สัญลักษณ์มือใหม่หัดขับ",
+    "🔱": "ฉมวก",
+    "🔟": "ปุ่มกดเลข_10",
+    "ℹ": "แหล่งข้อมูล",
+    "Ⓜ": "ตัวเอ็มในวงกลม",
+    "㊗": "ภาษาญี่ปุ่นคำว่า_“แสดงความยินดี”",
+    "㊙": "ภาษาญี่ปุ่นคำว่า_“ความลับ”",
+    "🅰": "เลือดกรุ๊ปเอ",
+    "🅱": "เลือดกรุ๊ปบี",
+    "🅾": "เลือดกรุ๊ปโอ",
+    "🅿": "ที่จอดรถ",
+    "🆎": "เลือดกรุ๊ปเอบี",
+    "🆑": "ลบข้อมูล",
+    "🆒": "เย็น",
+    "🆓": "ฟรี",
+    "🆔": "หมายเลขประจำตัว",
+    "🆕": "ใหม่",
+    "🆖": "ปุ่มเอ็นจี",
+    "🆗": "โอเค",
+    "🆘": "ช่วยด้วย",
+    "🆙": "ขึ้น",
+    "🆚": "ต่อสู้กับ",
+    "🈁": "ภาษาญี่ปุ่นคำว่า_“ที่นี่”",
+    "🈂": "ภาษาญี่ปุ่นคำว่า_“ค่าบริการ”",
+    "🈚": "ภาษาญี่ปุ่นคำว่า_“ไม่มี”",
+    "🈯": "ภาษาญี่ปุ่นคำว่า_“จองแล้ว”",
+    "🈲": "ภาษาญี่ปุ่นคำว่า_“ห้าม”",
+    "🈳": "ภาษาจีนว่างเปล่า",
+    "🈴": "ภาษาญี่ปุ่นคำว่า_“ผ่าน”",
+    "🈵": "ภาษาญี่ปุ่นคำว่า_“เต็ม”",
+    "🈶": "ภาษาญี่ปุ่นคำว่า_“คิดค่าใช้จ่าย”",
+    "🈷": "ภาษาญี่ปุ่นคำว่า_“จำนวนต่อเดือน”",
+    "🈸": "ภาษาญี่ปุ่นคำว่า_“ใบสมัคร”",
+    "🈹": "ภาษาญี่ปุ่นคำว่า_“ส่วนลด”",
+    "🈺": "ภาษาญี่ปุ่นคำว่า_“เปิดทำการ”",
+    "🉐": "ภาษาญี่ปุ่นคำว่า_“ราคาถูก”",
+    "🉑": "ภาษาญี่ปุ่นคำว่า_“ยอมรับได้”",
+    "🔠": "อักษรตัวพิมพ์ใหญ่",
+    "🔡": "อักษรตัวพิมพ์เล็ก",
+    "🔢": "หมายเลข",
+    "🔣": "สัญลักษณ์",
+    "🔤": "ตัวอักษรภาษาอังกฤษ",
+    "▪": "สี่เหลี่ยมเล็กสีดำ",
+    "▫": "สี่เหลี่ยมเล็กสีขาว",
+    "◻": "สี่เหลี่ยมขนาดกลางสีขาว",
+    "◼": "สี่เหลี่ยมขนาดกลางสีดำ",
+    "◽": "สี่เหลี่ยมเล็กปานกลางสีขาว",
+    "◾": "สี่เหลี่ยมเล็กปานกลางสีดำ",
+    "⚪": "วงกลมสีขาว",
+    "⚫": "วงกลมสีดำ",
+    "⬛": "สี่เหลี่ยมใหญ่สีดำ",
+    "⬜": "สี่เหลี่ยมใหญ่สีขาว",
+    "💠": "ข้าวหลามตัดมีจุดตรงกลาง",
+    "🔘": "ปุ่มวิทยุ",
+    "🔲": "ปุ่มสี่เหลี่ยมขอบดำ",
+    "🔳": "ปุ่มสี่เหลี่ยมขอบขาว",
+    "🔴": "วงกลมสีแดง",
+    "🔵": "วงกลมสีน้ำเงิน",
+    "🔶": "เพชรใหญ่สีส้ม",
+    "🔷": "เพชรใหญ่สีน้ำเงิน",
+    "🔸": "เพชรเล็กสีส้ม",
+    "🔹": "เพชรเล็กสีน้ำเงิน",
+    "🔺": "สามเหลี่ยมหงายสีแดง",
+    "🔻": "สามเหลี่ยมคว่ำสีแดง",
+    "🟠": "วงกลมสีส้ม",
+    "🟡": "วงกลมสีเหลือง",
+    "🟢": "วงกลมสีเขียว",
+    "🟣": "วงกลมสีม่วง",
+    "🟤": "วงกลมสีน้ำตาล",
+    "🟥": "สี่เหลี่ยมสีแดง",
+    "🟦": "สี่เหลี่ยมสีน้ำเงิน",
+    "🟧": "สี่เหลี่ยมสีส้ม",
+    "🟨": "สี่เหลี่ยมสีเหลือง",
+    "🟩": "สี่เหลี่ยมสีเขียว",
+    "🟪": "สี่เหลี่ยมสีม่วง",
+    "🟫": "สี่เหลี่ยมสีน้ำตาล",
+    "🎌": "ธงไขว้",
+    "🏁": "ธงตราหมากรุก",
+    "🏳": "ธงขาว",
+    "🏴": "ธงดำ",
+    "🚩": "ธงปักตำแหน่ง",
+    "#⃣": "ปุ่มกดเลข_#",
+    "*⃣": "ปุ่มกดเลข_*",
+    "0⃣": "ปุ่มกดเลข_0",
+    "1⃣": "ปุ่มกดเลข_1",
+    "2⃣": "ปุ่มกดเลข_2",
+    "3⃣": "ปุ่มกดเลข_3",
+    "4⃣": "ปุ่มกดเลข_4",
+    "5⃣": "ปุ่มกดเลข_5",
+    "6⃣": "ปุ่มกดเลข_6",
+    "7⃣": "ปุ่มกดเลข_7",
+    "8⃣": "ปุ่มกดเลข_8",
+    "9⃣": "ปุ่มกดเลข_9",
+    "🇦🇨": "ธง_เกาะแอสเซนชัน",
+    "🇦🇩": "ธง_อันดอร์รา",
+    "🇦🇪": "ธง_สหรัฐอาหรับเอมิเรตส์",
+    "🇦🇫": "ธง_อัฟกานิสถาน",
+    "🇦🇬": "ธง_แอนติกา_บาร์บูดา",
+    "🇦🇮": "ธง_แองกวิลลา",
+    "🇦🇱": "ธง_แอลเบเนีย",
+    "🇦🇲": "ธง_อาร์เมเนีย",
+    "🇦🇴": "ธง_แองโกลา",
+    "🇦🇶": "ธง_แอนตาร์กติกา",
+    "🇦🇷": "ธง_อาร์เจนตินา",
+    "🇦🇸": "ธง_อเมริกันซามัว",
+    "🇦🇹": "ธง_ออสเตรีย",
+    "🇦🇺": "ธง_ออสเตรเลีย",
+    "🇦🇼": "ธง_อารูบา",
+    "🇦🇽": "ธง_หมู่เกาะโอลันด์",
+    "🇦🇿": "ธง_อาเซอร์ไบจาน",
+    "🇧🇦": "ธง_บอสเนีย_เฮอร์เซโกวีนา",
+    "🇧🇧": "ธง_บาร์เบโดส",
+    "🇧🇩": "ธง_บังกลาเทศ",
+    "🇧🇪": "ธง_เบลเยียม",
+    "🇧🇫": "ธง_บูร์กินาฟาโซ",
+    "🇧🇬": "ธง_บัลแกเรีย",
+    "🇧🇭": "ธง_บาห์เรน",
+    "🇧🇮": "ธง_บุรุนดี",
+    "🇧🇯": "ธง_เบนิน",
+    "🇧🇱": "ธง_เซนต์บาร์เธเลมี",
+    "🇧🇲": "ธง_เบอร์มิวดา",
+    "🇧🇳": "ธง_บรูไน",
+    "🇧🇴": "ธง_โบลิเวีย",
+    "🇧🇶": "ธง_เนเธอร์แลนด์แคริบเบียน",
+    "🇧🇷": "ธง_บราซิล",
+    "🇧🇸": "ธง_บาฮามาส",
+    "🇧🇹": "ธง_ภูฏาน",
+    "🇧🇻": "ธง_เกาะบูเว",
+    "🇧🇼": "ธง_บอตสวานา",
+    "🇧🇾": "ธง_เบลารุส",
+    "🇧🇿": "ธง_เบลีซ",
+    "🇨🇦": "ธง_แคนาดา",
+    "🇨🇨": "ธง_หมู่เกาะโคโคส_(คีลิง)",
+    "🇨🇩": "ธง_คองโก_-_กินชาซา",
+    "🇨🇫": "ธง_สาธารณรัฐแอฟริกากลาง",
+    "🇨🇬": "ธง_คองโก_-_บราซซาวิล",
+    "🇨🇭": "ธง_สวิตเซอร์แลนด์",
+    "🇨🇮": "ธง_โกตดิวัวร์",
+    "🇨🇰": "ธง_หมู่เกาะคุก",
+    "🇨🇱": "ธง_ชิลี",
+    "🇨🇲": "ธง_แคเมอรูน",
+    "🇨🇳": "ธง_จีน",
+    "🇨🇴": "ธง_โคลอมเบีย",
+    "🇨🇵": "ธง_เกาะคลิปเปอร์ตัน",
+    "🇨🇷": "ธง_คอสตาริกา",
+    "🇨🇺": "ธง_คิวบา",
+    "🇨🇻": "ธง_เคปเวิร์ด",
+    "🇨🇼": "ธง_คูราเซา",
+    "🇨🇽": "ธง_เกาะคริสต์มาส",
+    "🇨🇾": "ธง_ไซปรัส",
+    "🇨🇿": "ธง_เช็ก",
+    "🇩🇪": "ธง_เยอรมนี",
+    "🇩🇬": "ธง_ดิเอโกการ์เซีย",
+    "🇩🇯": "ธง_จิบูตี",
+    "🇩🇰": "ธง_เดนมาร์ก",
+    "🇩🇲": "ธง_โดมินิกา",
+    "🇩🇴": "ธง_สาธารณรัฐโดมินิกัน",
+    "🇩🇿": "ธง_แอลจีเรีย",
+    "🇪🇦": "ธง_เซวตา_เมลียา",
+    "🇪🇨": "ธง_เอกวาดอร์",
+    "🇪🇪": "ธง_เอสโตเนีย",
+    "🇪🇬": "ธง_อียิปต์",
+    "🇪🇭": "ธง_ซาฮาราตะวันตก",
+    "🇪🇷": "ธง_เอริเทรีย",
+    "🇪🇸": "ธง_สเปน",
+    "🇪🇹": "ธง_เอธิโอเปีย",
+    "🇪🇺": "ธง_สหภาพยุโรป",
+    "🇫🇮": "ธง_ฟินแลนด์",
+    "🇫🇯": "ธง_ฟิจิ",
+    "🇫🇰": "ธง_หมู่เกาะฟอล์กแลนด์",
+    "🇫🇲": "ธง_ไมโครนีเซีย",
+    "🇫🇴": "ธง_หมู่เกาะแฟโร",
+    "🇫🇷": "ธง_ฝรั่งเศส",
+    "🇬🇦": "ธง_กาบอง",
+    "🇬🇧": "ธง_สหราชอาณาจักร",
+    "🇬🇩": "ธง_เกรเนดา",
+    "🇬🇪": "ธง_จอร์เจีย",
+    "🇬🇫": "ธง_เฟรนช์เกียนา",
+    "🇬🇬": "ธง_เกิร์นซีย์",
+    "🇬🇭": "ธง_กานา",
+    "🇬🇮": "ธง_ยิบรอลตาร์",
+    "🇬🇱": "ธง_กรีนแลนด์",
+    "🇬🇲": "ธง_แกมเบีย",
+    "🇬🇳": "ธง_กินี",
+    "🇬🇵": "ธง_กวาเดอลูป",
+    "🇬🇶": "ธง_อิเควทอเรียลกินี",
+    "🇬🇷": "ธง_กรีซ",
+    "🇬🇸": "ธง_เกาะเซาท์จอร์เจีย_หมู่เกาะเซาท์แซนด์วิช",
+    "🇬🇹": "ธง_กัวเตมาลา",
+    "🇬🇺": "ธง_กวม",
+    "🇬🇼": "ธง_กินี-บิสเซา",
+    "🇬🇾": "ธง_กายอานา",
+    "🇭🇰": "ธง_เขตปกครองพิเศษฮ่องกงแห่งสาธารณรัฐประชาชนจีน",
+    "🇭🇲": "ธง_เกาะเฮิร์ด_หมู่เกาะแมกดอนัลด์",
+    "🇭🇳": "ธง_ฮอนดูรัส",
+    "🇭🇷": "ธง_โครเอเชีย",
+    "🇭🇹": "ธง_เฮติ",
+    "🇭🇺": "ธง_ฮังการี",
+    "🇮🇨": "ธง_หมู่เกาะคานารี",
+    "🇮🇩": "ธง_อินโดนีเซีย",
+    "🇮🇪": "ธง_ไอร์แลนด์",
+    "🇮🇱": "ธง_อิสราเอล",
+    "🇮🇲": "ธง_เกาะแมน",
+    "🇮🇳": "ธง_อินเดีย",
+    "🇮🇴": "ธง_บริติชอินเดียนโอเชียนเทร์ริทอรี",
+    "🇮🇶": "ธง_อิรัก",
+    "🇮🇷": "ธง_อิหร่าน",
+    "🇮🇸": "ธง_ไอซ์แลนด์",
+    "🇮🇹": "ธง_อิตาลี",
+    "🇯🇪": "ธง_เจอร์ซีย์",
+    "🇯🇲": "ธง_จาเมกา",
+    "🇯🇴": "ธง_จอร์แดน",
+    "🇯🇵": "ธง_ญี่ปุ่น",
+    "🇰🇪": "ธง_เคนยา",
+    "🇰🇬": "ธง_คีร์กีซสถาน",
+    "🇰🇭": "ธง_กัมพูชา",
+    "🇰🇮": "ธง_คิริบาส",
+    "🇰🇲": "ธง_คอโมโรส",
+    "🇰🇳": "ธง_เซนต์คิตส์_เนวิส",
+    "🇰🇵": "ธง_เกาหลีเหนือ",
+    "🇰🇷": "ธง_เกาหลีใต้",
+    "🇰🇼": "ธง_คูเวต",
+    "🇰🇾": "ธง_หมู่เกาะเคย์แมน",
+    "🇰🇿": "ธง_คาซัคสถาน",
+    "🇱🇦": "ธง_ลาว",
+    "🇱🇧": "ธง_เลบานอน",
+    "🇱🇨": "ธง_เซนต์ลูเซีย",
+    "🇱🇮": "ธง_ลิกเตนสไตน์",
+    "🇱🇰": "ธง_ศรีลังกา",
+    "🇱🇷": "ธง_ไลบีเรีย",
+    "🇱🇸": "ธง_เลโซโท",
+    "🇱🇹": "ธง_ลิทัวเนีย",
+    "🇱🇺": "ธง_ลักเซมเบิร์ก",
+    "🇱🇻": "ธง_ลัตเวีย",
+    "🇱🇾": "ธง_ลิเบีย",
+    "🇲🇦": "ธง_โมร็อกโก",
+    "🇲🇨": "ธง_โมนาโก",
+    "🇲🇩": "ธง_มอลโดวา",
+    "🇲🇪": "ธง_มอนเตเนโกร",
+    "🇲🇫": "ธง_เซนต์มาร์ติน",
+    "🇲🇬": "ธง_มาดากัสการ์",
+    "🇲🇭": "ธง_หมู่เกาะมาร์แชลล์",
+    "🇲🇰": "ธง_มาซิโดเนียเหนือ",
+    "🇲🇱": "ธง_มาลี",
+    "🇲🇲": "ธง_เมียนมา_(พม่า)",
+    "🇲🇳": "ธง_มองโกเลีย",
+    "🇲🇴": "ธง_เขตปกครองพิเศษมาเก๊าแห่งสาธารณรัฐประชาชนจีน",
+    "🇲🇵": "ธง_หมู่เกาะนอร์เทิร์นมาเรียนา",
+    "🇲🇶": "ธง_มาร์ตินีก",
+    "🇲🇷": "ธง_มอริเตเนีย",
+    "🇲🇸": "ธง_มอนต์เซอร์รัต",
+    "🇲🇹": "ธง_มอลตา",
+    "🇲🇺": "ธง_มอริเชียส",
+    "🇲🇻": "ธง_มัลดีฟส์",
+    "🇲🇼": "ธง_มาลาวี",
+    "🇲🇽": "ธง_เม็กซิโก",
+    "🇲🇾": "ธง_มาเลเซีย",
+    "🇲🇿": "ธง_โมซัมบิก",
+    "🇳🇦": "ธง_นามิเบีย",
+    "🇳🇨": "ธง_นิวแคลิโดเนีย",
+    "🇳🇪": "ธง_ไนเจอร์",
+    "🇳🇫": "ธง_เกาะนอร์ฟอล์ก",
+    "🇳🇬": "ธง_ไนจีเรีย",
+    "🇳🇮": "ธง_นิการากัว",
+    "🇳🇱": "ธง_เนเธอร์แลนด์",
+    "🇳🇴": "ธง_นอร์เวย์",
+    "🇳🇵": "ธง_เนปาล",
+    "🇳🇷": "ธง_นาอูรู",
+    "🇳🇺": "ธง_นีอูเอ",
+    "🇳🇿": "ธง_นิวซีแลนด์",
+    "🇴🇲": "ธง_โอมาน",
+    "🇵🇦": "ธง_ปานามา",
+    "🇵🇪": "ธง_เปรู",
+    "🇵🇫": "ธง_เฟรนช์โปลินีเซีย",
+    "🇵🇬": "ธง_ปาปัวนิวกินี",
+    "🇵🇭": "ธง_ฟิลิปปินส์",
+    "🇵🇰": "ธง_ปากีสถาน",
+    "🇵🇱": "ธง_โปแลนด์",
+    "🇵🇲": "ธง_แซงปีแยร์_มีเกอลง",
+    "🇵🇳": "ธง_หมู่เกาะพิตแคร์น",
+    "🇵🇷": "ธง_เปอร์โตริโก",
+    "🇵🇸": "ธง_ดินแดนปาเลสไตน์",
+    "🇵🇹": "ธง_โปรตุเกส",
+    "🇵🇼": "ธง_ปาเลา",
+    "🇵🇾": "ธง_ปารากวัย",
+    "🇶🇦": "ธง_กาตาร์",
+    "🇷🇪": "ธง_เรอูนียง",
+    "🇷🇴": "ธง_โรมาเนีย",
+    "🇷🇸": "ธง_เซอร์เบีย",
+    "🇷🇺": "ธง_รัสเซีย",
+    "🇷🇼": "ธง_รวันดา",
+    "🇸🇦": "ธง_ซาอุดีอาระเบีย",
+    "🇸🇧": "ธง_หมู่เกาะโซโลมอน",
+    "🇸🇨": "ธง_เซเชลส์",
+    "🇸🇩": "ธง_ซูดาน",
+    "🇸🇪": "ธง_สวีเดน",
+    "🇸🇬": "ธง_สิงคโปร์",
+    "🇸🇭": "ธง_เซนต์เฮเลนา",
+    "🇸🇮": "ธง_สโลวีเนีย",
+    "🇸🇯": "ธง_สฟาลบาร์_ยานไมเอน",
+    "🇸🇰": "ธง_สโลวะเกีย",
+    "🇸🇱": "ธง_เซียร์ราลีโอน",
+    "🇸🇲": "ธง_ซานมาริโน",
+    "🇸🇳": "ธง_เซเนกัล",
+    "🇸🇴": "ธง_โซมาเลีย",
+    "🇸🇷": "ธง_ซูรินาเม",
+    "🇸🇸": "ธง_ซูดานใต้",
+    "🇸🇹": "ธง_เซาตูเม_ปรินซิปี",
+    "🇸🇻": "ธง_เอลซัลวาดอร์",
+    "🇸🇽": "ธง_ซินต์มาร์เทน",
+    "🇸🇾": "ธง_ซีเรีย",
+    "🇸🇿": "ธง_เอสวาตีนี",
+    "🇹🇦": "ธง_ทริสตันดาคูนา",
+    "🇹🇨": "ธง_หมู่เกาะเติกส์_หมู่เกาะเคคอส",
+    "🇹🇩": "ธง_ชาด",
+    "🇹🇫": "ธง_เฟรนช์เซาเทิร์นเทร์ริทอรีส์",
+    "🇹🇬": "ธง_โตโก",
+    "🇹🇭": "ธง_ไทย",
+    "🇹🇯": "ธง_ทาจิกิสถาน",
+    "🇹🇰": "ธง_โตเกเลา",
+    "🇹🇱": "ธง_ติมอร์-เลสเต",
+    "🇹🇲": "ธง_เติร์กเมนิสถาน",
+    "🇹🇳": "ธง_ตูนิเซีย",
+    "🇹🇴": "ธง_ตองกา",
+    "🇹🇷": "ธง_ตุรกี",
+    "🇹🇹": "ธง_ตรินิแดด_โตเบโก",
+    "🇹🇻": "ธง_ตูวาลู",
+    "🇹🇼": "ธง_ไต้หวัน",
+    "🇹🇿": "ธง_แทนซาเนีย",
+    "🇺🇦": "ธง_ยูเครน",
+    "🇺🇬": "ธง_ยูกันดา",
+    "🇺🇲": "ธง_หมู่เกาะรอบนอกของสหรัฐอเมริกา",
+    "🇺🇳": "ธง_สหประชาชาติ",
+    "🇺🇸": "ธง_สหรัฐอเมริกา",
+    "🇺🇾": "ธง_อุรุกวัย",
+    "🇺🇿": "ธง_อุซเบกิสถาน",
+    "🇻🇦": "ธง_นครวาติกัน",
+    "🇻🇨": "ธง_เซนต์วินเซนต์_เกรนาดีนส์",
+    "🇻🇪": "ธง_เวเนซุเอลา",
+    "🇻🇬": "ธง_หมู่เกาะบริติชเวอร์จิน",
+    "🇻🇮": "ธง_หมู่เกาะเวอร์จินของสหรัฐอเมริกา",
+    "🇻🇳": "ธง_เวียดนาม",
+    "🇻🇺": "ธง_วานูอาตู",
+    "🇼🇫": "ธง_วาลลิส_ฟุตูนา",
+    "🇼🇸": "ธง_ซามัว",
+    "🇽🇰": "ธง_โคโซโว",
+    "🇾🇪": "ธง_เยเมน",
+    "🇾🇹": "ธง_มายอต",
+    "🇿🇦": "ธง_แอฟริกาใต้",
+    "🇿🇲": "ธง_แซมเบีย",
+    "🇿🇼": "ธง_ซิมบับเว",
+    "👁‍🗨": "ตาในลูกโป่งคำพูด",
+    "👨‍🦰": "ผู้ชาย_ผมแดง",
+    "👨‍🦱": "ผู้ชาย_ผมหยิก",
+    "👨‍🦲": "ผู้ชาย_หัวล้าน",
+    "👨‍🦳": "ผู้ชาย_ผมขาว",
+    "👩‍🦰": "ผู้หญิง_ผมแดง",
+    "👩‍🦱": "ผู้หญิง_ผมหยิก",
+    "👩‍🦲": "ผู้หญิง_หัวล้าน",
+    "👩‍🦳": "ผู้หญิง_ผมขาว",
+    "👱‍♀": "ผู้หญิงผมทอง",
+    "👱‍♂": "ผู้ชายผมทอง",
+    "🧑‍🦰": "คน_ผมแดง",
+    "🧑‍🦱": "คน_ผมหยิก",
+    "🧑‍🦲": "คน_หัวล้าน",
+    "🧑‍🦳": "คน_ผมขาว",
+    "💁‍♀": "ผู้หญิงแบมือ",
+    "💁‍♂": "ผู้ชายแบมือ",
+    "🙅‍♀": "ผู้หญิงทำท่าไม่โอเค",
+    "🙅‍♂": "ผู้ชายทำท่าไม่โอเค",
+    "🙆‍♀": "ผู้หญิงทำท่าโอเค",
+    "🙆‍♂": "ผู้ชายทำท่าโอเค",
+    "🙇‍♀": "ผู้หญิงหมอบคำนับ",
+    "🙇‍♂": "ผู้ชายหมอบคำนับ",
+    "🙋‍♀": "ผู้หญิงยกมือ",
+    "🙋‍♂": "ผู้ชายยกมือ",
+    "🙍‍♀": "ผู้หญิงหน้าบึ้ง",
+    "🙍‍♂": "ผู้ชายหน้าบึ้ง",
+    "🙎‍♀": "ผู้หญิงโกรธ",
+    "🙎‍♂": "ผู้ชายโกรธ",
+    "🤦‍♀": "ผู้หญิงเอามือก่ายหน้าผาก",
+    "🤦‍♂": "ผู้ชายเอามือก่ายหน้าผาก",
+    "🤷‍♀": "ผู้หญิงยักไหล่",
+    "🤷‍♂": "ผู้ชายยักไหล่",
+    "🧏‍♀": "ผู้หญิงหูหนวก",
+    "🧏‍♂": "ผู้ชายหูหนวก",
+    "👨‍⚕": "หมอชาย",
+    "👨‍⚖": "ผู้พิพากษาชาย",
+    "👨‍✈": "นักบินชาย",
+    "👨‍🌾": "ชาวนาชาย",
+    "👨‍🍳": "พ่อครัว",
+    "👨‍🍼": "ผู้ชายให้นมลูก",
+    "👨‍🎓": "นักเรียนชาย",
+    "👨‍🎤": "นักร้องชาย",
+    "👨‍🎨": "ศิลปินชาย",
+    "👨‍🏫": "ครูชาย",
+    "👨‍🏭": "พนักงานโรงงานชาย",
+    "👨‍💻": "ผู้เชี่ยวชาญด้านเทคโนโลยีชาย",
+    "👨‍💼": "พนักงานบริษัทชาย",
+    "👨‍🔧": "ช่างซ่อมชาย",
+    "👨‍🔬": "นักวิทยาศาสตร์ชาย",
+    "👨‍🚀": "นักบินอวกาศชาย",
+    "👨‍🚒": "พนักงานดับเพลิงชาย",
+    "👩‍⚕": "หมอหญิง",
+    "👩‍⚖": "ผู้พิพากษาหญิง",
+    "👩‍✈": "นักบินหญิง",
+    "👩‍🌾": "ชาวนาหญิง",
+    "👩‍🍳": "แม่ครัว",
+    "👩‍🍼": "ผู้หญิงให้นมลูก",
+    "👩‍🎓": "นักเรียนหญิง",
+    "👩‍🎤": "นักร้องหญิง",
+    "👩‍🎨": "ศิลปินหญิง",
+    "👩‍🏫": "ครูหญิง",
+    "👩‍🏭": "พนักงานโรงงานหญิง",
+    "👩‍💻": "ผู้เชี่ยวชาญด้านเทคโนโลยีหญิง",
+    "👩‍💼": "พนักงานบริษัทหญิง",
+    "👩‍🔧": "ช่างซ่อมหญิง",
+    "👩‍🔬": "นักวิทยาศาสตร์หญิง",
+    "👩‍🚀": "นักบินอวกาศหญิง",
+    "👩‍🚒": "พนักงานดับเพลิงหญิง",
+    "👮‍♀": "ตำรวจหญิง",
+    "👮‍♂": "ตำรวจชาย",
+    "👰‍♀": "ผู้หญิงที่มีผ้าคลุมหน้า",
+    "👰‍♂": "ผู้ชายที่มีผ้าคลุมหน้า",
+    "👳‍♀": "ผู้หญิงโพกหัว",
+    "👳‍♂": "ผู้ชายโพกหัว",
+    "👷‍♀": "พนักงานก่อสร้างหญิง",
+    "👷‍♂": "พนักงานก่อสร้างชาย",
+    "💂‍♀": "องครักษ์หญิง",
+    "💂‍♂": "องครักษ์ชาย",
+    "🕵‍♀": "นักสืบหญิง",
+    "🕵‍♂": "นักสืบชาย",
+    "🤵‍♀": "ผู้หญิงใส่ทักซิโด้",
+    "🤵‍♂": "ผู้ชายใส่ทักซิโด้",
+    "🧑‍⚕": "หมอ",
+    "🧑‍⚖": "ผู้พิพากษา",
+    "🧑‍✈": "นักบิน",
+    "🧑‍🌾": "ชาวนา",
+    "🧑‍🍳": "กุ๊ก",
+    "🧑‍🍼": "คนให้นมลูก",
+    "🧑‍🎓": "บัณฑิต",
+    "🧑‍🎤": "นักร้อง",
+    "🧑‍🎨": "ศิลปิน",
+    "🧑‍🏫": "ครู",
+    "🧑‍🏭": "พนักงานโรงงาน",
+    "🧑‍💻": "ผู้เชี่ยวชาญด้านเทคโนโลยี",
+    "🧑‍💼": "พนักงานออฟฟิศ",
+    "🧑‍🔧": "ช่างกล",
+    "🧑‍🔬": "นักวิทยาศาสตร์",
+    "🧑‍🚀": "นักบินอวกาศ",
+    "🧑‍🚒": "พนักงานดับเพลิง",
+    "🦸‍♀": "ยอดหญิง",
+    "🦸‍♂": "พระเอก",
+    "🦹‍♀": "นางร้าย",
+    "🦹‍♂": "ตัวโกง",
+    "🧑‍🎄": "ซานตาคลอส",
+    "🧙‍♀": "แม่มด",
+    "🧙‍♂": "พ่อมด",
+    "🧚‍♀": "เทพธิดา",
+    "🧚‍♂": "เทพบุตร",
+    "🧛‍♀": "แวมไพร์ผู้หญิง",
+    "🧛‍♂": "แวมไพร์ผู้ชาย",
+    "🧜‍♀": "เงือก",
+    "🧜‍♂": "เงือกชาย",
+    "🧝‍♀": "เอลฟ์ผู้หญิง",
+    "🧝‍♂": "เอลฟ์ผู้ชาย",
+    "🧞‍♀": "ยักษ์จีนี่หญิง",
+    "🧞‍♂": "ยักษ์จีนี่ชาย",
+    "🧟‍♀": "ซอมบี้ผู้หญิง",
+    "🧟‍♂": "ซอมบี้ผู้ชาย",
+    "🏃‍♀": "ผู้หญิงวิ่ง",
+    "🏃‍♂": "ผู้ชายวิ่ง",
+    "👨‍🦯": "ผู้ชายเดินถือไม้เท้านำทาง",
+    "👨‍🦼": "ผู้ชายนั่งวีลแชร์ไฟฟ้า",
+    "👨‍🦽": "ผู้ชายนั่งวีลแชร์ธรรมดา",
+    "👩‍🦯": "ผู้หญิงเดินถือไม้เท้านำทาง",
+    "👩‍🦼": "ผู้หญิงนั่งวีลแชร์ไฟฟ้า",
+    "👩‍🦽": "ผู้หญิงนั่งวีลแชร์ธรรมดา",
+    "👯‍♀": "ผู้หญิงในชุดหูกระต่าย",
+    "👯‍♂": "ผู้ชายในชุดหูกระต่าย",
+    "💆‍♀": "ผู้หญิงกำลังนวดหน้า",
+    "💆‍♂": "ผู้ชายกำลังนวดหน้า",
+    "💇‍♀": "ผู้หญิงกำลังตัดผม",
+    "💇‍♂": "ผู้ชายกำลังตัดผม",
+    "🚶‍♀": "ผู้หญิงเดิน",
+    "🚶‍♂": "ผู้ชายเดิน",
+    "🧍‍♀": "ผู้หญิงกำลังยืน",
+    "🧍‍♂": "ผู้ชายกำลังยืน",
+    "🧎‍♀": "ผู้หญิงกำลังคุกเข่า",
+    "🧎‍♂": "ผู้ชายกำลังคุกเข่า",
+    "🧑‍🦯": "คนเดินถือไม้เท้านำทาง",
+    "🧑‍🦼": "คนนั่งวีลแชร์ไฟฟ้า",
+    "🧑‍🦽": "คนนั่งวีลแชร์ธรรมดา",
+    "🧖‍♀": "ผู้หญิงในห้องอบไอน้ำ",
+    "🧖‍♂": "ผู้ชายในห้องอบไอน้ำ",
+    "🧗‍♀": "ผู้หญิงไต่เขา",
+    "🧗‍♂": "ผู้ชายไต่เขา",
+    "⛹‍♀": "ผู้หญิงเล่นบอล",
+    "⛹‍♂": "ผู้ชายเล่นบอล",
+    "🏄‍♀": "ผู้หญิงโต้คลื่น",
+    "🏄‍♂": "ผู้ชายโต้คลื่น",
+    "🏊‍♀": "ผู้หญิงว่ายน้ำ",
+    "🏊‍♂": "ผู้ชายว่ายน้ำ",
+    "🏋‍♀": "ผู้หญิงยกน้ำหนัก",
+    "🏋‍♂": "ผู้ชายยกน้ำหนัก",
+    "🏌‍♀": "ผู้หญิงตีกอล์ฟ",
+    "🏌‍♂": "ผู้ชายตีกอล์ฟ",
+    "🚣‍♀": "ผู้หญิงพายเรือ",
+    "🚣‍♂": "ผู้ชายพายเรือ",
+    "🚴‍♀": "ผู้หญิงปั่นจักรยาน",
+    "🚴‍♂": "ผู้ชายปั่นจักรยาน",
+    "🚵‍♀": "ผู้หญิงปั่นจักรยานเสือภูเขา",
+    "🚵‍♂": "ผู้ชายปั่นจักรยานเสือภูเขา",
+    "🤸‍♀": "ผู้หญิงตีลังกา",
+    "🤸‍♂": "ผู้ชายตีลังกา",
+    "🤹‍♀": "ผู้หญิงเล่นจั๊กกลิ้ง",
+    "🤹‍♂": "ผู้ชายเล่นจั๊กกลิ้ง",
+    "🤼‍♀": "ผู้หญิงเล่นมวยปล้ำ",
+    "🤼‍♂": "ผู้ชายเล่นมวยปล้ำ",
+    "🤽‍♀": "ผู้หญิงเล่นโปโลน้ำ",
+    "🤽‍♂": "ผู้ชายเล่นโปโลน้ำ",
+    "🤾‍♀": "ผู้หญิงเล่นแฮนด์บอล",
+    "🤾‍♂": "ผู้ชายเล่นแฮนด์บอล",
+    "🧘‍♀": "ผู้หญิงนั่งสมาธิ",
+    "🧘‍♂": "ผู้ชายนั่งสมาธิ",
+    "👨‍👦": "ครอบครัว_ผู้ชาย_เด็กชาย",
+    "👨‍👧": "ครอบครัว_ผู้ชาย_เด็กหญิง",
+    "👩‍👦": "ครอบครัว_ผู้หญิง_เด็กชาย",
+    "👩‍👧": "ครอบครัว_ผู้หญิง_เด็กหญิง",
+    "🐈‍⬛": "แมวดำ",
+    "🐕‍🦺": "สุนัขบริการ",
+    "🐻‍❄": "หมีขั้วโลก",
+    "🏳‍⚧": "ธงคนข้ามเพศ",
+    "🏳‍🌈": "ธงสีรุ้ง",
+    "🏴‍☠": "ธงโจรสลัด",
+    "👨‍❤‍👨": "คู่รัก_ผู้ชาย_ผู้ชาย",
+    "👨‍👦‍👦": "ครอบครัว_ผู้ชาย_เด็กชาย_เด็กชาย",
+    "👨‍👧‍👦": "ครอบครัว_ผู้ชาย_เด็กหญิง_เด็กชาย",
+    "👨‍👧‍👧": "ครอบครัว_ผู้ชาย_เด็กหญิง_เด็กหญิง",
+    "👨‍👨‍👦": "ครอบครัว_ผู้ชาย_ผู้ชาย_เด็กชาย",
+    "👨‍👨‍👧": "ครอบครัว_ผู้ชาย_ผู้ชาย_เด็กหญิง",
+    "👨‍👩‍👦": "ครอบครัว_ผู้ชาย_ผู้หญิง_เด็กชาย",
+    "👨‍👩‍👧": "ครอบครัว_ผู้ชาย_ผู้หญิง_เด็กหญิง",
+    "👩‍❤‍👨": "คู่รัก_ผู้หญิง_ผู้ชาย",
+    "👩‍❤‍👩": "คู่รัก_ผู้หญิง_ผู้หญิง",
+    "👩‍👦‍👦": "ครอบครัว_ผู้หญิง_เด็กชาย_เด็กชาย",
+    "👩‍👧‍👦": "ครอบครัว_ผู้หญิง_เด็กหญิง_เด็กชาย",
+    "👩‍👧‍👧": "ครอบครัว_ผู้หญิง_เด็กหญิง_เด็กหญิง",
+    "👩‍👩‍👦": "ครอบครัว_ผู้หญิง_ผู้หญิง_เด็กชาย",
+    "👩‍👩‍👧": "ครอบครัว_ผู้หญิง_ผู้หญิง_เด็กหญิง",
+    "🧑‍🤝‍🧑": "คนจับมือกัน",
+    "👨‍❤‍💋‍👨": "จูบ_ผู้ชาย_ผู้ชาย",
+    "👨‍👨‍👦‍👦": "ครอบครัว_ผู้ชาย_ผู้ชาย_เด็กชาย_เด็กชาย",
+    "👨‍👨‍👧‍👦": "ครอบครัว_ผู้ชาย_ผู้ชาย_เด็กหญิง_เด็กชาย",
+    "👨‍👨‍👧‍👧": "ครอบครัว_ผู้ชาย_ผู้ชาย_เด็กหญิง_เด็กหญิง",
+    "👨‍👩‍👦‍👦": "ครอบครัว_ผู้ชาย_ผู้หญิง_เด็กชาย_เด็กชาย",
+    "👨‍👩‍👧‍👦": "ครอบครัว_ผู้ชาย_ผู้หญิง_เด็กหญิง_เด็กชาย",
+    "👨‍👩‍👧‍👧": "ครอบครัว_ผู้ชาย_ผู้หญิง_เด็กหญิง_เด็กหญิง",
+    "👩‍❤‍💋‍👨": "จูบ_ผู้หญิง_ผู้ชาย",
+    "👩‍❤‍💋‍👩": "จูบ_ผู้หญิง_ผู้หญิง",
+    "👩‍👩‍👦‍👦": "ครอบครัว_ผู้หญิง_ผู้หญิง_เด็กชาย_เด็กชาย",
+    "👩‍👩‍👧‍👦": "ครอบครัว_ผู้หญิง_ผู้หญิง_เด็กหญิง_เด็กชาย",
+    "👩‍👩‍👧‍👧": "ครอบครัว_ผู้หญิง_ผู้หญิง_เด็กหญิง_เด็กหญิง",
+    "🏴󠁧󠁢󠁥󠁮󠁧󠁿": "ธง_อังกฤษ",
+    "🏴󠁧󠁢󠁳󠁣󠁴󠁿": "ธง_สกอตแลนด์",
+    "🏴󠁧󠁢󠁷󠁬󠁳󠁿": "ธง_เวลส์",
+}
+
+_th_emoji = {v: k for k, v in _emoji_th.items()}
+
+_emojis = sorted(_emoji_th.keys(), key=len, reverse=True)
+_emoji_regex = re.compile("|".join(map(re.escape, _emojis)))
+_delimiter = ":"
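+
+The pattern above is deliberately built longest-first: many emoji are multi-codepoint sequences (flags, ZWJ families, keycaps), and sorting by length before joining the alternation keeps a short prefix such as 👩 from matching ahead of a longer sequence such as 👩‍🦰. A minimal standalone sketch of the same idea, using one entry from the table above plus the bare woman emoji, whose Thai name is assumed here for illustration::
+
+    import re
+
+    # Names follow the mapping table above; escape sequences used for clarity.
+    emoji_names = {
+        "\U0001F469": "ผู้หญิง",  # woman (name assumed for this sketch)
+        "\U0001F469\u200d\U0001F9B0": "ผู้หญิง_ผมแดง",  # woman + ZWJ + red hair
+    }
+
+    # Longest-first ordering so the ZWJ sequence wins over its first codepoint.
+    ordered = sorted(emoji_names, key=len, reverse=True)
+    pattern = re.compile("|".join(map(re.escape, ordered)))
+
+    text = "สวัสดี \U0001F469\u200d\U0001F9B0"
+    print(pattern.sub(lambda m: ":" + emoji_names[m.group(0)] + ":", text))
+    # expected: สวัสดี :ผู้หญิง_ผมแดง: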
+
+
+
+[docs] +def emoji_to_thai(text: str, delimiters=(_delimiter, _delimiter)) -> str: + """ + This function converts emojis to their Thai meanings + + :param str text: Text with emojis + :return: Text with emojis converted to their Thai meanings + :rtype: str + + :Example: + :: + + from pythainlp.util import emoji_to_thai + + emoji_to_thai("จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ ใกล้ชิดประชาชนดี 😀") + # output: จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ + ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน: + + emoji_to_thai("หิวข้าวอยากกินอาหารญี่ปุ่น 🍣") + # output: หิวข้าวอยากกินอาหารญี่ปุ่น :ซูชิ: + + emoji_to_thai("🇹🇭 นี่คือธงประเทศไทย") + # output: :ธง_ไทย: นี่คือธงประเทศไทย + """ + + return _emoji_regex.sub( + lambda match: delimiters[0] + + _emoji_th[match.group(0)] + + delimiters[1], + text, + )
+ +
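+
+Since the Thai reading is wrapped with the delimiters pair, callers can choose markers other than the default colons. A short usage sketch, with the expected output following the mapping table above::
+
+    from pythainlp.util import emoji_to_thai
+
+    print(emoji_to_thai("ไป⚽กัน"))
+    # expected: ไป:ลูกฟุตบอล:กัน
+
+    print(emoji_to_thai("ไป⚽กัน", delimiters=("[", "]")))
+    # expected: ไป[ลูกฟุตบอล]กัน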
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/encoding.html b/5.1/_modules/pythainlp/util/encoding.html new file mode 100644 index 0000000..1263451 --- /dev/null +++ b/5.1/_modules/pythainlp/util/encoding.html @@ -0,0 +1,184 @@ + + + + + + + + pythainlp.util.encoding — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.encoding

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+[docs] +def tis620_to_utf8(text: str)->str: + """ + Convert TIS-620 to UTF-8 + + :param str text: TIS-620 encoded text + :return: UTF-8 encoded text + :rtype: str + + :Example: + :: + + from pythainlp.util import tis620_to_utf8 + + tis620_to_utf8("¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ") + # output: 'กระทรวงอุตสาหกรรม' + """ + return text.encode("cp1252", "ignore").decode("tis-620")
+ + + +
+[docs] +def to_idna(text: str) -> str: + """ + Encode text with IDNA, as used in Internationalized Domain Name (IDN). + + :param str text: Thai text + :return: IDNA-encoded text + :rtype: str + + :Example: + :: + + from pythainlp.util import to_idna + + to_idna("คนละครึ่ง.com") + # output: 'xn--42caj4e6bk1f5b1j.com' + """ + return text.encode("idna").decode("utf-8")
+ +
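+
+Both helpers above are thin wrappers over Python codecs. A short sketch of the IDNA round trip; reversing the encoding is not part of this module, and Python's built-in "idna" codec (IDNA 2003) is assumed here to handle these simple labels::
+
+    from pythainlp.util import to_idna
+
+    ascii_host = to_idna("คนละครึ่ง.com")
+    print(ascii_host)
+    # expected: xn--42caj4e6bk1f5b1j.com
+
+    # Going back: decode the ASCII form with the built-in codec.
+    print(ascii_host.encode("ascii").decode("idna"))
+    # expected: คนละครึ่ง.com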
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/keyboard.html b/5.1/_modules/pythainlp/util/keyboard.html new file mode 100644 index 0000000..ef24b8f --- /dev/null +++ b/5.1/_modules/pythainlp/util/keyboard.html @@ -0,0 +1,371 @@ + + + + + + + + pythainlp.util.keyboard — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.keyboard

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Functions related to keyboard layout.
+"""
+
+EN_TH_KEYB_PAIRS = {
+    "Z": "(",
+    "z": "ผ",
+    "X": ")",
+    "x": "ป",
+    "C": "ฉ",
+    "c": "แ",
+    "V": "ฮ",
+    "v": "อ",
+    "B": "\u0e3a",  # พินทุ
+    "b": "\u0e34",  # สระอุ
+    "N": "\u0e4c",  # การันต์
+    "n": "\u0e37",  # สระอือ
+    "M": "?",
+    "m": "ท",
+    "<": "ฒ",
+    ",": "ม",
+    ">": "ฬ",
+    ".": "ใ",
+    "?": "ฦ",
+    "/": "ฝ",
+    "A": "ฤ",
+    "a": "ฟ",
+    "S": "ฆ",
+    "s": "ห",
+    "D": "ฏ",
+    "d": "ก",
+    "F": "โ",
+    "f": "ด",
+    "G": "ฌ",
+    "g": "เ",
+    "H": "\u0e47",  # ไม้ไต่คู้
+    "h": "\u0e49",  # ไม้โท
+    "J": "\u0e4b",  # ไม้จัตวา
+    "j": "\u0e48",  # ไม้เอก
+    "K": "ษ",
+    "k": "า",
+    "L": "ศ",
+    "l": "ส",
+    ":": "ซ",
+    ";": "ว",
+    '"': ".",
+    "'": "ง",
+    "Q": "๐",
+    "q": "ๆ",
+    "W": '"',
+    "w": "ไ",
+    "E": "ฎ",
+    "e": "\u0e33",  # สระอำ
+    "R": "ฑ",
+    "r": "พ",
+    "T": "ธ",
+    "t": "ะ",
+    "Y": "\u0e4d",  # นิคหิต
+    "y": "\u0e31",  # ไม้หันอากาศ
+    "U": "\u0e4a",  # ไม้ตรี
+    "u": "\u0e35",  # สระอ ี
+    "I": "ณ",
+    "i": "ร",
+    "O": "ฯ",
+    "o": "น",
+    "P": "ญ",
+    "p": "ย",
+    "{": "ฐ",
+    "[": "บ",
+    "}": ",",
+    "]": "ล",
+    "|": "ฅ",
+    "\\": "ฃ",
+    "~": "%",
+    "`": "_",
+    "@": "๑",
+    "2": "/",
+    "#": "๒",
+    "3": "-",
+    "$": "๓",
+    "4": "ภ",
+    "%": "๔",
+    "5": "ถ",
+    "^": "\u0e39",  # สระอู
+    "6": "\u0e38",  # สระอุ
+    "&": "฿",
+    "7": "\u0e36",  # สระอึ
+    "*": "๕",
+    "8": "ค",
+    "(": "๖",
+    "9": "ต",
+    ")": "๗",
+    "0": "จ",
+    "_": "๘",
+    "-": "ข",
+    "+": "๙",
+    "=": "ช",
+}
+
+TH_EN_KEYB_PAIRS = {v: k for k, v in EN_TH_KEYB_PAIRS.items()}
+
+EN_TH_TRANSLATE_TABLE = str.maketrans(EN_TH_KEYB_PAIRS)
+TH_EN_TRANSLATE_TABLE = str.maketrans(TH_EN_KEYB_PAIRS)
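+
+Because both directions are plain character maps, the conversion itself is a single pass of str.translate() over the input. A minimal sketch of the same mechanism, using only a tiny subset of the key pairs listed above rather than the full Kedmanee map::
+
+    pairs = {"d": "ก", "k": "า"}  # subset of EN_TH_KEYB_PAIRS, for illustration
+    en_to_th = str.maketrans(pairs)
+    th_to_en = str.maketrans({v: k for k, v in pairs.items()})
+
+    print("dk".translate(en_to_th))   # expected: กา
+    print("กา".translate(th_to_en))   # expected: dk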
+
+TIS_820_2531_MOD = [
+    ["-", "ๅ", "/", "", "_", "ภ", "ถ", "ุ", "ึ", "ค", "ต", "จ", "ข", "ช"],
+    ["ๆ", "ไ", "ำ", "พ", "ะ", "ั", "ี", "ร", "น", "ย", "บ", "ล", "ฃ"],
+    ["ฟ", "ห", "ก", "ด", "เ", "้", "่", "า", "ส", "ว", "ง"],
+    ["ผ", "ป", "แ", "อ", "ิ", "ื", "ท", "ม", "ใ", "ฝ"],
+]
+TIS_820_2531_MOD_SHIFT = [
+    ["%", "+", "๑", "๒", "๓", "๔", "ู", "฿", "๕", "๖", "๗", "๘", "๙"],
+    ["๐", '"', "ฎ", "ฑ", "ธ", "ํ", "๊", "ณ", "ฯ", "ญ", "ฐ", ",", "ฅ"],
+    ["ฤ", "ฆ", "ฏ", "โ", "ฌ", "็", "๋", "ษ", "ศ", "ซ", "."],
+    ["(", ")", "ฉ", "ฮ", "ฺ", "์", "?", "ฒ", "ฬ", "ฦ"],
+]
+
+
+
+[docs] +def eng_to_thai(text: str) -> str: + """ + Corrects the given text that was incorrectly typed using English-US + Qwerty keyboard layout to the originally intended keyboard layout + that is the Thai Kedmanee keyboard. + + :param str text: incorrect text input (Thai typed using English keyboard) + :return: Thai text with typing using + incorrect keyboard layout is corrected + :rtype: str + + :Example: + + Intentionally type "ธนาคารแห่งประเทศไทย", but got "Tok8kicsj'xitgmLwmp":: + + from pythainlp.util import eng_to_thai + + eng_to_thai("Tok8kicsj'xitgmLwmp") + # output: ธนาคารแห่งประเทศไทย + """ + return text.translate(EN_TH_TRANSLATE_TABLE)
+ + + +
+[docs] +def thai_to_eng(text: str) -> str: + """ + Corrects the given text that was incorrectly typed using Thai Kedmanee + keyboard layout to the originally intended keyboard layout + that is the English-US Qwerty keyboard. + + :param str text: incorrect text input (English typed using Thai keyboard) + :return: English text with typing with + incorrect keyboard layout is corrected + :rtype: str + + :Example: + + Intentionally type "Bank of Thailand", but got "ฺฟืา นด ธ้ฟรสฟืก":: + + from pythainlp.util import eng_to_thai + + thai_to_eng("ฺฟืา นด ธ้ฟรสฟืก") + # output: 'Bank of Thailand' + """ + return text.translate(TH_EN_TRANSLATE_TABLE)
+ + + +def thai_keyboard_dist(c1: str, c2: str, shift_dist: float = 0.0) -> float: + """ + Calculate Euclidean distance between two Thai characters + according to their location on a Thai keyboard layout. + + A modified TIS 820-2531 standard keyboard layout, which is developed + from Kedmanee layout and is the most commonly used Thai keyboard layout, + is used in distance calculation. + + The modified TIS 820-2531 is TIS 820-2531 with few key extensions + proposed in TIS 820-2536 draft. See Figure 4, notice grey keys, in + https://www.nectec.or.th/it-standards/keyboard_layout/thai-key.html + + Noted that the latest TIS 820-2538 has slight changes in layout from + TIS 820-2531. See Figure 2, notice the Thai Baht sign and ฅ-ฃ pair, in + https://www.nectec.or.th/it-standards/std820/std820.html + Since TIS 820-2538 is not widely adopted by keyboard manufacturer, + this function uses the de facto standard modified TIS 820-2531 instead. + + :param str c1: first character + :param str c2: second character + :param str shift_dist: return value if they're shifted + :return: Euclidean distance between two characters + :rtype: float + + :Example: + + from pythainlp.util import thai_keyboard_dist + thai_keyboard_dist("ด", "ะ") + # output: 1.4142135623730951 + thai_keyboard_dist("ฟ", "ฤ") + # output: 0.0 + thai_keyboard_dist("ฟ", "ห") + # output: 1.0 + thai_keyboard_dist("ฟ", "ก") + # output: 2.0 + thai_keyboard_dist("ฟ", "ฤ", 0.5) + # output: 0.5 + """ + + def get_char_coord( + ch: str, layouts=[TIS_820_2531_MOD, TIS_820_2531_MOD_SHIFT] + ): + for layout in layouts: + for row in layout: + if ch in row: + r = layout.index(row) + c = row.index(ch) + return (r, c) + raise ValueError(ch + " not found in given keyboard layout") + + coord1 = get_char_coord(c1) + coord2 = get_char_coord(c2) + distance = ( + (coord1[0] - coord2[0]) ** 2 + (coord1[1] - coord2[1]) ** 2 + ) ** (0.5) + if distance == 0 and c1 != c2: + return shift_dist + return distance +
+ +
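+
+thai_keyboard_dist() above works on single characters and raises ValueError for anything not on the modified TIS 820-2531 layout. A hedged sketch of one possible use, summing per-character distances as a crude typo score; the typo_distance() helper is our own illustration, not part of the module::
+
+    from pythainlp.util import thai_keyboard_dist
+
+    def typo_distance(a: str, b: str) -> float:
+        # Assumes equal-length strings whose characters all appear on the
+        # layout tables above; otherwise thai_keyboard_dist() raises ValueError.
+        return sum(thai_keyboard_dist(c1, c2) for c1, c2 in zip(a, b))
+
+    print(typo_distance("กา", "ดา"))
+    # expected: 1.0  ("ก" and "ด" sit on adjacent keys; "า" matches itself)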
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/keywords.html b/5.1/_modules/pythainlp/util/keywords.html new file mode 100644 index 0000000..ba6c19d --- /dev/null +++ b/5.1/_modules/pythainlp/util/keywords.html @@ -0,0 +1,248 @@ + + + + + + + + pythainlp.util.keywords — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.keywords

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from collections import Counter
+from typing import Dict, List
+
+from pythainlp.corpus import thai_stopwords
+
+_STOPWORDS = thai_stopwords()
+
+
+
+[docs] +def rank(words: List[str], exclude_stopwords: bool = False) -> Counter: + """ + Count word frequencies given a list of Thai words with an option + to exclude stopwords. + + :param list words: a list of words + :param bool exclude_stopwords: If this parameter is set to **True**, + exclude stopwords from counting. + Otherwise, the stopwords will be counted. + By default, `exclude_stopwords`is + set to **False** + :return: a Counter object representing word frequencies in the text + :rtype: :class:`collections.Counter` + + :Example: + + Include stopwords when counting word frequencies:: + + from pythainlp.util import rank + + words = ["บันทึก", "เหตุการณ์", " ", "มี", "การ", "บันทึก", \\ + "เป็น", " ", "ลายลักษณ์อักษร"] + + rank(words) + # output: + # Counter( + # { + # ' ': 2, + # 'การ': 1, + # 'บันทึก': 2, + # 'มี': 1, + # 'ลายลักษณ์อักษร': 1, + # 'เป็น': 1, + # 'เหตุการณ์': 1 + # }) + + Exclude stopwords when counting word frequencies:: + + from pythainlp.util import rank + + words = ["บันทึก", "เหตุการณ์", " ", "มี", "การ", "บันทึก", \\ + "เป็น", " ", "ลายลักษณ์อักษร"] + + rank(words) + # output: + # Counter( + # { + # ' ': 2, + # 'บันทึก': 2, + # 'ลายลักษณ์อักษร': 1, + # 'เหตุการณ์': 1 + # }) + """ + if not words: + return None + + if exclude_stopwords: + words = [word for word in words if word not in _STOPWORDS] + + return Counter(words)
+ + + +
+[docs] +def find_keyword(word_list: List[str], min_len: int = 3) -> Dict[str, int]: + """ + This function counts the frequencies of words in the list + where stopword is excluded and returns a frequency dictionary. + + :param list word_list: a list of words + :param int min_len: the minimum frequency for words to be retained + + :return: a dictionary object with key-value pair being words and their raw counts + :rtype: dict[str, int] + + :Example: + :: + + from pythainlp.util import find_keyword + + words = ["บันทึก", "เหตุการณ์", "บันทึก", "เหตุการณ์", + " ", "มี", "การ", "บันทึก", "เป็น", " ", "ลายลักษณ์อักษร" + "และ", "การ", "บันทึก","เสียง","ใน","เหตุการณ์"] + + find_keyword(words) + # output: {'บันทึก': 4, 'เหตุการณ์': 3} + + find_keyword(words, min_len=1) + # output: {' ': 2, 'บันทึก': 4, 'ลายลักษณ์อักษรและ': 1, + 'เสียง': 1, 'เหตุการณ์': 3} + """ + word_list = rank(word_list, exclude_stopwords=True) + + return {k: v for k, v in word_list.items() if v >= min_len}
+ +
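+
+In practice find_keyword() is usually fed the output of a word tokenizer. A small sketch; the exact counts depend on how the tokenizer segments the text, so the output shown is indicative only::
+
+    from pythainlp.tokenize import word_tokenize
+    from pythainlp.util import find_keyword
+
+    text = "บันทึกเหตุการณ์ บันทึกเหตุการณ์ มีการบันทึกเป็นลายลักษณ์อักษร"
+    words = word_tokenize(text)
+
+    print(find_keyword(words, min_len=2))
+    # indicative output: {'บันทึก': 3, 'เหตุการณ์': 2, ...}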
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/lcs.html b/5.1/_modules/pythainlp/util/lcs.html new file mode 100644 index 0000000..888f808 --- /dev/null +++ b/5.1/_modules/pythainlp/util/lcs.html @@ -0,0 +1,208 @@ + + + + + + + + pythainlp.util.lcs — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.lcs

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+
+[docs] +def longest_common_subsequence(str1: str, str2: str) -> str: + """ + Find the longest common subsequence between two strings. + + :param str str1: The first string. + :param str str2: The second string. + :return: The longest common subsequence. + :rtype: str + + :Example: + :: + + from pythainlp.util.lcs import longest_common_subsequence + + print(longest_common_subsequence("ABCBDAB", "BDCAB")) + # output: "BDAB" + """ + m = len(str1) + n = len(str2) + + # Create a 2D array to store lengths of longest common subsequence. + dp = [[0] * (n + 1) for _ in range(m + 1)] + + # Build the dp array from bottom up. + for i in range(m + 1): + for j in range(n + 1): + if i == 0 or j == 0: + dp[i][j] = 0 + elif str1[i - 1] == str2[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + else: + dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) + + # Following code is used to print LCS + index = dp[m][n] + + # Create a character array to store the lcs string + lcs = [""] * (index + 1) + lcs[index] = "" + + # Start from the right-most-bottom-most corner and + # one by one store characters in lcs[] + i = m + j = n + while i > 0 and j > 0: + + # If current character in str1 and str2 are same, then + # current character is part of LCS + if str1[i - 1] == str2[j - 1]: + lcs[index - 1] = str1[i - 1] + i -= 1 + j -= 1 + index -= 1 + + # If not same, then find the larger of two and + # go in the direction of larger value + elif dp[i - 1][j] > dp[i][j - 1]: + i -= 1 + else: + j -= 1 + + return "".join(lcs)
+ +
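+
+The table-filling step above is O(len(str1) × len(str2)) in both time and memory, so the function is best suited to short strings. Another usage sketch, this time with Thai input::
+
+    from pythainlp.util.lcs import longest_common_subsequence
+
+    print(longest_common_subsequence("ธนาคารไทย", "ธนาคารแห่งประเทศไทย"))
+    # expected: ธนาคารไทย (the whole first string appears, in order, inside the second)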
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/morse.html b/5.1/_modules/pythainlp/util/morse.html new file mode 100644 index 0000000..160a41a --- /dev/null +++ b/5.1/_modules/pythainlp/util/morse.html @@ -0,0 +1,344 @@ + + + + + + + + pythainlp.util.morse — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.morse

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+THAI_MORSE_CODE = {
+    "ก": "--.",
+    "ข": "-.-.",
+    "ค": "-.-",
+    "ฆ": "-.-",
+    "ง": "-.--.",
+    "จ": "-..-.",
+    "ฉ": "----",
+    "ช": "-..-",
+    "ฌ": "-..-",
+    "ซ": "--..",
+    "ญ": ".---",
+    "ด": "-..",
+    "ถ": "-.-..",
+    "ฐ": "-.-..",
+    "ฑ": "-..--",
+    "ฒ": "-..--",
+    "ท": "-..--",
+    "ธ": "-..--",
+    "ณ": "-.",
+    "น": "-.",
+    "บ": "-...",
+    "ป": ".--.",
+    "ผ": "--.-",
+    "ฝ": "-.-.-",
+    "พ": ".--..",
+    "ภ": ".--..",
+    "ฟ": "..-.",
+    "ม": "--",
+    "ย": "-.--",
+    "ร": ".-.",
+    "ล": ".-..",
+    "ฬ": ".-..",
+    "ว": ".--",
+    "ศ": "...",
+    "ษ": "...",
+    "ส": "...",
+    "ห": "....",
+    "ฮ": "--.--",
+    "ฎ": "-..",
+    "ต": "-",
+    "ฏ": "-",
+    "ฤ": ".-.--",
+    "่": "..-",
+    "้": "...-",
+    "๊": "--...",
+    "๋": ".-.-.",
+    "ั": ".--.-",
+    "็": "---..",
+    "์": "--..-",
+    "ั้": ".---.",
+    "ฯ": "--.-.",
+    "ฯลฯ": "---.-",
+    "ๆ": "---.-",
+    "ะ": ".-...",
+    "า": ".-",
+    "ิ": "..-..",
+    "ี": "..",
+    "ึ": "..--.",
+    "ื": "..--",
+    "ุ": "..-.-",
+    "ู": "---.",
+    "เ": ".",
+    "แ": ".-.-",
+    "โ": "---",
+    "ไ": ".-..-",
+    "ใ": ".-..-",
+    "ำ": "...-.",
+    "อ": "-...-",
+}
+
+ENGLISH_MORSE_CODE = {
+    "A": ".-",
+    "B": "-...",
+    "C": "-.-.",
+    "D": "-..",
+    "E": ".",
+    "F": "..-.",
+    "G": "--.",
+    "H": "....",
+    "I": "..",
+    "J": ".---",
+    "K": "-.-",
+    "L": ".-..",
+    "M": "--",
+    "N": "-.",
+    "O": "---",
+    "P": ".--.",
+    "Q": "--.-",
+    "R": ".-.",
+    "S": "...",
+    "T": "-",
+    "U": "..-",
+    "V": "...-",
+    "W": ".--",
+    "X": "-..-",
+    "Y": "-.--",
+    "Z": "--..",
+    "0": "-----",
+    ",": "--..--",
+    "1": ".----",
+    ".": ".-.-.-",
+    "2": "..---",
+    "?": "..--..",
+    "3": "...--",
+    ";": "-.-.-.",
+    "4": "....-",
+    ":": "---...",
+    "5": ".....",
+    "'": ".----.",
+    "6": "-....",
+    "-": "-....-",
+    "7": "--...",
+    "/": "-..-.",
+    "8": "---..",
+    "(": "-.--.-",
+}
+
+decodingeng = {}
+for key, val in ENGLISH_MORSE_CODE.items():
+    decodingeng[val] = key
+
+decodingthai = {}
+for key, val in THAI_MORSE_CODE.items():
+    decodingthai[val.replace(" ", "")] = key
+
+for key, val in THAI_MORSE_CODE.items():
+    THAI_MORSE_CODE[key] = val.replace(" ", "")
+
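+
+The three loops above build reverse lookup tables (Morse code back to character) for decoding and strip any spaces from the Thai codes. An equivalent construction with dict comprehensions, shown only to make the intent explicit; the module keeps the loop form. Note that when several characters share one code, such as ศ, ษ, and ส all mapping to "...", the reverse table keeps only the last one inserted, which is the "wrong character" caveat mentioned in morse_decode() below::
+
+    # Assumes the ENGLISH_MORSE_CODE and THAI_MORSE_CODE tables defined above.
+    decoding_en = {code: ch for ch, code in ENGLISH_MORSE_CODE.items()}
+    decoding_th = {code.replace(" ", ""): ch for ch, code in THAI_MORSE_CODE.items()}
+    th_no_space = {ch: code.replace(" ", "") for ch, code in THAI_MORSE_CODE.items()}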
+
+
+[docs] +def morse_encode(text: str, lang: str = "th") -> str: + """ + Convert text to Morse code (support Thai and English) + + :param str text: Text + :param str lang: Language Code (*th* is Thai and *en* is English) + :return: Morse code + :rtype: str + + :Example: + :: + + from pythainlp.util.morse import morse_encode + print(morse_encode("แมว", lang="th")) + # output: .-.- -- .-- + + print(morse_encode("cat", lang="en")) + # output: -.-. .- - + """ + if lang == "th": # Thai + return " ".join( + map(lambda x, g=THAI_MORSE_CODE.get: g(x, " "), text.upper()) + ) + elif lang == "en": # English + return " ".join( + map(lambda x, g=ENGLISH_MORSE_CODE.get: g(x, " "), text.upper()) + ) + else: + raise NotImplementedError(f"This function doesn't support {lang}.")
+ + + +
+[docs] +def morse_decode(morse_text: str, lang: str = "th") -> str: + """ + Simple Convert Morse code to text + + Thai still have some wrong character problem that\ + can fix by spell corrector. + + :param str morse_text: Morse code + :param str lang: Language Code (*th* is Thai and *en* is English) + :return: Text + :rtype: str + + :Example: + :: + + from pythainlp.util.morse import morse_decode + print(morse_decode(".-.- -- .--", lang="th")) + # output: แมว + + print(morse_decode("-.-. .- -", lang="en")) + # output: CAT + """ + if lang == "th": + ans = "".join( + map(lambda x, g=decodingthai.get: g(x, ""), morse_text.split(" ")) + ) + return "".join(ans.split()) + elif lang == "en": + ans = "".join( + map(lambda x, g=decodingeng.get: g(x, " "), morse_text.split(" ")) + ) + return " ".join(ans.split()) + else: + raise NotImplementedError(f"This function doesn't support {lang}.")
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/normalize.html b/5.1/_modules/pythainlp/util/normalize.html new file mode 100644 index 0000000..ef2fa8b --- /dev/null +++ b/5.1/_modules/pythainlp/util/normalize.html @@ -0,0 +1,504 @@ + + + + + + + + pythainlp.util.normalize — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.normalize

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Text normalization
+"""
+
+import re
+from typing import List, Union
+
+from pythainlp import thai_above_vowels as above_v
+from pythainlp import thai_below_vowels as below_v
+from pythainlp import thai_follow_vowels as follow_v
+from pythainlp import thai_lead_vowels as lead_v
+from pythainlp import thai_tonemarks as tonemarks
+from pythainlp.tokenize import word_tokenize
+from pythainlp.tools import warn_deprecation
+
+_DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
+_RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+")
+
+_ZERO_WIDTH_CHARS = "\u200b\u200c"  # ZWSP, ZWNJ
+
+_REORDER_PAIRS = [
+    ("\u0e40\u0e40", "\u0e41"),  # Sara E + Sara E -> Sara Ae
+    (
+        f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)",
+        "\\2\\1",
+    ),  # TONE/Thanthakhat + ABV/BLW VOWEL -> ABV/BLW VOWEL + TONE/Thanthakhat
+    (
+        f"\u0e4d([{tonemarks}]*)\u0e32",
+        "\\1\u0e33",
+    ),  # Nikhahit + TONEMARK + Sara Aa -> TONEMARK + Sara Am
+    (
+        f"([{follow_v}]+)([{tonemarks}]+)",
+        "\\2\\1",
+    ),  # FOLLOW VOWEL + TONEMARK+ -> TONEMARK + FOLLOW VOWEL
+    ("([^\u0e24\u0e26])\u0e45", "\\1\u0e32"),  # Lakkhangyao -> Sara Aa
+]
+
+# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan
+_NOREPEAT_CHARS = (
+    f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e"
+)
+_NOREPEAT_PAIRS = list(
+    zip([f"({ch}[ ]*)+{ch}" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS)
+)
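+
+Each entry in _REORDER_PAIRS is a regex substitution that rewrites characters into the standard order (for example, two Sara E become one Sara Ae), and each entry in _NOREPEAT_PAIRS collapses a run of the same vowel or sign into a single character. A small sketch applying two of these rules directly, matching the examples in normalize() below::
+
+    import re
+
+    # First rule of _REORDER_PAIRS: Sara E + Sara E -> Sara Ae.
+    print(re.sub("\u0e40\u0e40", "\u0e41", "เเปลก"))
+    # expected: แปลก
+
+    # The _NOREPEAT_PAIRS entry for Sara Aa collapses a repeated vowel.
+    print(re.sub("(\u0e32[ ]*)+\u0e32", "\u0e32", "นานาาา"))
+    # expected: นานา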
+
+_RE_TONEMARKS = re.compile(f"[{tonemarks}]+")
+
+_RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*")
+
+
+def _last_char(matchobj):  # to be used with _RE_NOREPEAT_TONEMARKS
+    return matchobj.group(0)[-1]
+
+
+
+[docs] +def remove_dangling(text: str) -> str: + """ + Remove Thai non-base characters at the beginning of text. + + This is a common "typo", especially for input field in a form, + as these non-base characters can be visually hidden from user + who may accidentally typed them in. + + A character to be removed should be both: + + * tone mark, above vowel, below vowel, or non-base sign AND + * located at the beginning of the text + + :param str text: input text + :return: text without dangling Thai characters at the beginning + :rtype: str + + :Example: + :: + + from pythainlp.util import remove_dangling + + remove_dangling("๊ก") + # output: 'ก' + """ + return _RE_REMOVE_DANGLINGS.sub("", text)
+ + + +
+[docs] +def remove_dup_spaces(text: str) -> str: + """ + Remove duplicate spaces. Replace multiple spaces with one space. + + Multiple newline characters and empty lines will be replaced + with one newline character. + + :param str text: input text + :return: text without duplicated spaces and newlines + :rtype: str + + :Example: + :: + + from pythainlp.util import remove_dup_spaces + + remove_dup_spaces("ก ข ค") + # output: 'ก ข ค' + """ + while " " in text: + text = text.replace(" ", " ") + text = _RE_REMOVE_NEWLINES.sub("\n", text) + text = text.strip() + return text
+ + + +
+[docs] +def remove_tonemark(text: str) -> str: + """ + Remove all Thai tone marks from the text. + + Thai script has four tone marks indicating four tones as follows: + + * Down tone (Thai: ไม้เอก _่ ) + * Falling tone (Thai: ไม้โท _้ ) + * High tone (Thai: ไม้ตรี _๊ ) + * Rising tone (Thai: ไม้จัตวา _๋ ) + + Putting wrong tone mark is a common mistake in Thai writing. + By removing tone marks from the string, it could be used to + for a approximate string matching. + + :param str text: input text + :return: text without Thai tone marks + :rtype: str + + :Example: + :: + + from pythainlp.util import remove_tonemark + + remove_tonemark("สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด") + # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด + """ + for ch in tonemarks: + while ch in text: + text = text.replace(ch, "") + return text
+ + + +
+[docs] +def remove_zw(text: str) -> str: + """ + Remove zero-width characters. + + These non-visible characters may cause unexpected result from the + user's point of view. Removing them can make string matching more robust. + + Characters to be removed: + + * Zero-width space (ZWSP) + * Zero-width non-joiner (ZWJP) + + :param str text: input text + :return: text without zero-width characters + :rtype: str + """ + for ch in _ZERO_WIDTH_CHARS: + while ch in text: + text = text.replace(ch, "") + + return text
+ + + +
+[docs] +def reorder_vowels(text: str) -> str: + """ + Reorder vowels and tone marks to the standard logical order/spelling. + + Characters in input text will be reordered/transformed, + according to these rules: + + * Sara E + Sara E -> Sara Ae + * Nikhahit + Sara Aa -> Sara Am + * tone mark + non-base vowel -> non-base vowel + tone mark + * follow vowel + tone mark -> tone mark + follow vowel + + :param str text: input text + :return: text with vowels and tone marks in the standard logical order + :rtype: str + """ + for pair in _REORDER_PAIRS: + text = re.sub(pair[0], pair[1], text) + + return text
+ + + +
+[docs] +def remove_repeat_vowels(text: str) -> str: + """ + Remove repeating vowels, tone marks, and signs. + + This function will call reorder_vowels() first, to make sure that + double Sara E will be converted to Sara Ae and not be removed. + + :param str text: input text + :return: text without repeating Thai vowels, tone marks, and signs + :rtype: str + """ + text = reorder_vowels(text) + for pair in _NOREPEAT_PAIRS: + text = re.sub(pair[0], pair[1], text) + + # remove repeating tone marks, use last tone mark + text = _RE_TONEMARKS.sub(_last_char, text) + + return text
+ + + +
+[docs] +def normalize(text: str) -> str: + """ + Normalize and clean Thai text with normalizing rules as follows: + + * Remove zero-width spaces + * Remove duplicate spaces + * Reorder tone marks and vowels to standard order/spelling + * Remove duplicate vowels and signs + * Remove duplicate tone marks + * Remove dangling non-base characters at the beginning of text + + normalize() simply call remove_zw(), remove_dup_spaces(), + remove_repeat_vowels(), and remove_dangling(), in that order. + + If a user wants to customize the selection or the order of rules + to be applied, they can choose to call those functions by themselves. + + Note: for Unicode normalization, see unicodedata.normalize(). + + :param str text: input text + :return: normalized text according to the rules + :rtype: str + + :Example: + :: + + from pythainlp.util import normalize + + normalize("เเปลก") # starts with two Sara E + # output: แปลก + + normalize("นานาาา") + # output: นานา + """ + text = remove_zw(text) + text = remove_dup_spaces(text) + text = remove_repeat_vowels(text) + text = remove_dangling(text) + + return text
+ + + +def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]: + """ + Expand Maiyamok. + + Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word + repetition. This function preprocesses Thai text by replacing + Maiyamok with a word being repeated. + + :param Union[str, List[str]] sent: sentence (list or string) + :return: list of words + :rtype: List[str] + + :Example: + :: + from pythainlp.util import expand_maiyamok + + expand_maiyamok("คนๆนก") + # output: ['คน', 'คน', 'นก'] + """ + if isinstance(sent, str): + sent = word_tokenize(sent) + + yamok = "ๆ" + + # Breaks Maiyamok that attached to others, e.g. "นกๆๆ", "นกๆ ๆ", "นกๆคน" + re_yamok = re.compile(rf"({yamok})") + temp_toks: list[str] = [] + for token in sent: + toks = re_yamok.split(token) + toks = [tok for tok in toks if tok] # remove empty string ("") + temp_toks.extend(toks) + sent = temp_toks + del temp_toks + + output_toks: list[str] = [] + yamok_count = 0 + len_sent = len(sent) + for i in range(len_sent - 1, -1, -1): # do it backward + if yamok_count == 0 or (i + 1 >= len_sent): + if sent[i] == yamok: + yamok_count = yamok_count + 1 + else: + output_toks.append(sent[i]) + continue + + if sent[i] == yamok: + yamok_count = yamok_count + 1 + else: + if sent[i].isspace(): + if yamok_count > 0: # remove space before yamok + continue + else: # with preprocessing above, this should not happen + output_toks.append(sent[i]) + else: + output_toks.extend([sent[i]] * (yamok_count + 1)) + yamok_count = 0 + + return output_toks[::-1] + + +
+[docs] +def maiyamok(sent: Union[str, List[str]]) -> List[str]: + """ + Expand Maiyamok. + + Deprecated. Use expand_maiyamok() instead. + + Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word + repetition. This function preprocesses Thai text by replacing + Maiyamok with a word being repeated. + + :param Union[str, List[str]] sent: sentence (list or string) + :return: list of words + :rtype: List[str] + + :Example: + :: + + from pythainlp.util import expand_maiyamok + + expand_maiyamok("คนๆนก") + # output: ['คน', 'คน', 'นก'] + """ + warn_deprecation( + "pythainlp.util.maiyamok", + "pythainlp.util.expand_maiyamok", + "5.0.5", + "5.2", + ) + return expand_maiyamok(sent)
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/numtoword.html b/5.1/_modules/pythainlp/util/numtoword.html new file mode 100644 index 0000000..8a90c0c --- /dev/null +++ b/5.1/_modules/pythainlp/util/numtoword.html @@ -0,0 +1,266 @@ + + + + + + + + pythainlp.util.numtoword — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.numtoword

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Convert number value to Thai read out
+
+Adapted from
+http://justmindthought.blogspot.com/2012/12/code-php.html
+https://suksit.com/post/writing-bahttext-in-php/
+"""
+
+__all__ = ["bahttext", "num_to_thaiword"]
+
+_VALUES = [
+    "",
+    "หนึ่ง",
+    "สอง",
+    "สาม",
+    "สี่",
+    "ห้า",
+    "หก",
+    "เจ็ด",
+    "แปด",
+    "เก้า",
+]
+_PLACES = ["", "สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน"]
+_EXCEPTIONS = {"หนึ่งสิบ": "สิบ", "สองสิบ": "ยี่สิบ", "สิบหนึ่ง": "สิบเอ็ด"}
+
+
+
+[docs] +def bahttext(number: float) -> str: + """ + This function converts a number to Thai text and adds + a suffix "บาท" (Baht). + The precision will be fixed at two decimal places (0.00) + to fits "สตางค์" (Satang) unit. + This function works similar to `BAHTTEXT` function in Microsoft Excel. + + :param float number: number to be converted into Thai Baht currency format + :return: text representing the amount of money in the format + of Thai currency + :rtype: str + :Example: + :: + + from pythainlp.util import bahttext + + bahttext(1) + # output: หนึ่งบาทถ้วน + + bahttext(21) + # output: ยี่สิบเอ็ดบาทถ้วน + + bahttext(200) + # output: สองร้อยบาทถ้วน + """ + ret = "" + + if number is None: + pass + elif number == 0: + ret = "ศูนย์บาทถ้วน" + else: + num_int, num_dec = "{:.2f}".format(number).split(".") + num_int = int(num_int) + num_dec = int(num_dec) + + baht = num_to_thaiword(num_int) + if baht: + ret = "".join([ret, baht, "บาท"]) + + satang = num_to_thaiword(num_dec) + if satang and satang != "ศูนย์": + ret = "".join([ret, satang, "สตางค์"]) + else: + ret = "".join([ret, "ถ้วน"]) + + return ret
+ + + +
+[docs] +def num_to_thaiword(number: int) -> str: + """ + This function converts number to Thai text + + :param int number: an integer number to be converted to Thai text + :return: text representing the number in Thai + :rtype: str + + :Example: + :: + + from pythainlp.util import num_to_thaiword + + num_to_thaiword(1) + # output: หนึ่ง + + num_to_thaiword(11) + # output: สิบเอ็ด + """ + + output = "" + number_temp = number + if number is None: + return "" + elif number == 0: + output = "ศูนย์" + + number = str(abs(number)) + for place, value in enumerate(list(number[::-1])): + if place % 6 == 0 and place > 0: + output = _PLACES[6] + output + + if value != "0": + output = _VALUES[int(value)] + _PLACES[place % 6] + output + + for search, replac in _EXCEPTIONS.items(): + output = output.replace(search, replac) + + if number_temp < 0: + output = "ลบ" + output + + return output
+ +
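+
+Tracing the place-value rules and _EXCEPTIONS above gives, for instance, the readings below (a small sketch; the satang part comes from fixing the amount to two decimal places)::
+
+    from pythainlp.util import bahttext, num_to_thaiword
+
+    print(bahttext(121.50))
+    # expected: หนึ่งร้อยยี่สิบเอ็ดบาทห้าสิบสตางค์
+
+    print(num_to_thaiword(-11))
+    # expected: ลบสิบเอ็ด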
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/phoneme.html b/5.1/_modules/pythainlp/util/phoneme.html new file mode 100644 index 0000000..42c1d50 --- /dev/null +++ b/5.1/_modules/pythainlp/util/phoneme.html @@ -0,0 +1,406 @@ + + + + + + + + pythainlp.util.phoneme — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.phoneme

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Phoneme utilities
+"""
+import unicodedata
+
+from pythainlp.tokenize import Tokenizer
+from pythainlp.util.trie import Trie
+
+consonants_ipa_nectec = [
+    ("k", "k", "k^"),
+    ("kʰ", "kh"),
+    ("ŋ", "ng", "ng^"),
+    ("tɕ", "c"),
+    ("tɕʰ", "ch"),
+    ("s", "s"),
+    ("j", "j", "j^"),
+    ("d", "d"),
+    ("t", "y", "t^"),
+    ("tʰ", "th"),
+    ("n", "n", "n^"),
+    ("b", "b"),
+    ("p", "p", "p^"),
+    ("pʰ", "ph"),
+    ("f", "f"),
+    ("m", "m", "m^"),
+    ("r", "r"),
+    ("l", "l"),
+    ("w", "w", "w^"),
+    ("h", "h"),
+    ("?", "z", "z^"),
+]
+# ipa, initial, final
+
+monophthong_ipa_nectec = [
+    ("i", "i"),
+    ("e", "e"),
+    ("ɛ", "x"),
+    ("ɤ", "q"),
+    ("a", "a"),
+    ("am", "am^"),
+    ("aj", "aj^"),
+    ("aw", "aw^"),
+    ("u", "u"),
+    ("o", "o"),
+    ("ɔ", "@"),
+    ("ii", "ii"),
+    ("ee", "ee"),
+    ("ɛɛ", "xx"),
+    ("ɯɯ", "vv"),
+    ("ɤɤ", "qq"),
+    ("aa", "aa"),
+    ("uu", "uu"),
+    ("oo", "oo"),
+    ("", "@@"),  # -อ long
+]
+
+diphthong_ipa_nectec = [
+    ("ia", "ia"),
+    ("ɯa", "va"),
+    ("ua", "ua"),
+    ("iia", "iia"),
+    ("ɯɯa", "vva"),
+    ("uua", "uua"),
+]
+
+tones_ipa_nectec = [
+    ("˧", "0"),
+    ("˨˩", "1"),
+    ("˥˩", "2"),
+    ("˦˥", "3"),
+    ("˩˩˦", "4"),
+]
+
+dict_nectec_to_ipa = {
+    i[1]: i[0]
+    for i in consonants_ipa_nectec
+    + monophthong_ipa_nectec
+    + diphthong_ipa_nectec
+    + tones_ipa_nectec
+}
+dict_nectec_to_ipa.update(
+    {i[2]: i[0] for i in consonants_ipa_nectec if len(i) > 2}
+)
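An editorial sketch of the lookup table assembled above; the comment "# ipa, initial, final" means a tuple can carry a separate final-position code, and the update() call maps those finals to the same IPA symbol (the import path below is assumed from the file location):

    from pythainlp.util import phoneme  # the module shown above

    assert phoneme.dict_nectec_to_ipa["kh"] == "kʰ"   # initial form of ("kʰ", "kh")
    assert phoneme.dict_nectec_to_ipa["ng"] == "ŋ"    # initial form of ("ŋ", "ng", "ng^")
    assert phoneme.dict_nectec_to_ipa["ng^"] == "ŋ"   # final form, added by update()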
+
+
+
+[docs] +def nectec_to_ipa(pronunciation: str) -> str: + """ + Convert NECTEC system to IPA system + + :param str pronunciation: NECTEC phoneme + :return: IPA that is converted + :rtype: str + + :Example: + :: + + from pythainlp.util import nectec_to_ipa + + print(nectec_to_ipa("kl-uua-j^-2")) + # output : 'kl uua j ˥˩' + + + References + ---------- + + Pornpimon Palingoon, Sumonmas Thatphithakkul. Chapter 4 Speech processing \ + and Speech corpus. In: Handbook of Thai Electronic Corpus. \ + 1st ed. p. 122–56. + """ + parts = pronunciation.split("-") + ipa = [] + for part in parts: + if part in dict_nectec_to_ipa.keys(): + ipa.append(dict_nectec_to_ipa[part]) + else: + ipa.append(part) + return " ".join(ipa)
+ + + +dict_ipa_rtgs = { + "b": "b", + "d": "d", + "f": "f", + "h": "h", + # The conversion of j depends on its position in the syllable. + # But, unfortunately, the current implementation cannot handle both cases. + # To remove confusions without changing the behavior and breaking existing codes, + # it is suggested that the first key-value mapping of j be simply commented out, + # as it would be overridden by the second one and thus never take effect from the beginning. + # See #846 for a more detailed discussion: https://github.com/PyThaiNLP/pythainlp/issues/846 + # "j":"y", + "k": "k", + "kʰ": "kh", + "l": "l", + "m": "m", + "n": "n", + "ŋ": "ng", + "p": "p", + "pʰ": "ph", + "r": "r", + "s": "s", + "t": "t", + "tʰ": "th", + "tɕ": "ch", + "tɕʰ": "ch", + "w": "w", + "ʔ": "", + "j": "i", + "a": "a", + "e": "e", + "ɛ": "ae", + "i": "i", + "o": "o", + "ɔ": "o", + "u": "u", + "ɯ": "ue", + "ɤ": "oe", + "aː": "a", + "eː": "e", + "ɛː": "ae", + "iː": "i", + "oː": "o", + "ɔː": "o", + "uː": "u", + "ɯː": "ue", + "ɤː": "oe", + "ia": "ia", + "ua": "ua", + "ɯa": "uea", + "aj": "ai", + "aw": "ao", + "ew": "eo", + "ɛw": "aeo", + "iw": "io", + "ɔj": "io", + "uj": "ui", + "aːj": "ai", + "aːw": "ao", + "eːw": "eo", + "ɛːw": "aeo", + "oːj": "oi", + "ɔːj": "oi", + "ɤːj": "oei", + "iaw": "iao", + "uaj": "uai", + "ɯaj": "ueai", + ".": ".", +} + +dict_ipa_rtgs_final = {"w": "o"} +trie = Trie(list(dict_ipa_rtgs.keys()) + list(dict_ipa_rtgs_final.keys())) +ipa_cut = Tokenizer(custom_dict=trie, engine="newmm") + + +
+[docs] +def ipa_to_rtgs(ipa: str) -> str: + """ + Convert IPA system to The Royal Thai General System of Transcription (RTGS) + + Docs: https://en.wikipedia.org/wiki/Help:IPA/Thai + + :param str ipa: IPA phoneme + :return: The RTGS that is converted, according to rules listed in the Wikipedia page + :rtype: str + + :Example: + :: + + from pythainlp.util import ipa_to_rtgs + + print(ipa_to_rtgs("kluaj")) + # output : 'kluai' + + """ + rtgs_parts = [] + + ipa_parts = ipa_cut.word_tokenize(ipa) + for i, ipa_part in enumerate(ipa_parts): + if i == len(ipa_parts) - 1 and ipa_part in list(dict_ipa_rtgs_final): + rtgs_parts.append(dict_ipa_rtgs_final[ipa_part]) + elif ipa_part in list(dict_ipa_rtgs): + rtgs_parts.append(dict_ipa_rtgs[ipa_part]) + else: + rtgs_parts.append(ipa_part) + + rtgs = "".join(rtgs_parts) + rtgs = ( + unicodedata.normalize("NFKD", rtgs) + .encode("ascii", "ignore") + .decode("utf-8") + ) + + return rtgs
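A short editorial sketch of the final-position override in ipa_to_rtgs: dict_ipa_rtgs_final only rewrites a syllable-final "w", so the same IPA symbol romanizes differently at the end of the token list. The expected values follow from the mappings above; the actual output depends on how the newmm tokenizer splits the input:

    from pythainlp.util import ipa_to_rtgs

    print(ipa_to_rtgs("kluaj"))  # kluai (docstring example: non-final j -> i)
    print(ipa_to_rtgs("kʰaːw"))  # expected khao: kʰ -> kh, aː -> a, final w -> o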
+ + + +
+[docs] +def remove_tone_ipa(ipa: str) -> str: + """ + Remove Thai Tones from IPA system + + :param str ipa: IPA phoneme + :return: IPA phoneme with tones removed + :rtype: str + + :Example: + :: + + from pythainlp.util import remove_tone_ipa + + print(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦")) + # output : laː.sa.maj + + """ + _list_tone = ["˩˩˦", "˥˩", "˨˩", "˦˥", "˧"] + for tone in _list_tone: + ipa = ipa.replace(tone, "") + return ipa
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/pronounce.html b/5.1/_modules/pythainlp/util/pronounce.html new file mode 100644 index 0000000..cd4514f --- /dev/null +++ b/5.1/_modules/pythainlp/util/pronounce.html @@ -0,0 +1,329 @@ + + + + + + + + pythainlp.util.pronounce — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.pronounce

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from typing import List
+import re
+
+from pythainlp.corpus import thai_words
+from pythainlp.khavee import KhaveeVerifier
+from pythainlp.tokenize import syllable_tokenize
+from pythainlp.tokenize import Tokenizer
+from pythainlp import thai_consonants, thai_tonemarks
+from pythainlp.util import remove_tonemark
+
+kv = KhaveeVerifier()
+all_thai_words_dict = None
+
+
+
+
[docs]
+def rhyme(word: str) -> List[str]:
+    """
+    Find Thai words that rhyme with the given word
+
+    :param str word: A Thai word
+    :return: list of all Thai words that rhyme with the input word
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.util import rhyme
+
+        print(rhyme("จีบ"))
+        # output: ['กลีบ', 'กีบ', 'ครีบ', ...]
+    """
+    global all_thai_words_dict
+    list_sumpus = []
+    if all_thai_words_dict is None:
+        all_thai_words_dict = [
+            i for i in list(thai_words()) if len(syllable_tokenize(i)) == 1
+        ]
+    for i in all_thai_words_dict:
+        if kv.is_sumpus(word, i) and i != word:
+            list_sumpus.append(i)
+    return sorted(list_sumpus)
+ + + +thai_vowel = ''.join(( + "อะ,อา,อิ,อี,อึ,อื,อุ,อู,เอะ,เอ,แอะ,แอ,เอียะ,เอีย,เอือะ,เอือ,อัวะ,อัว,โอะ,", + "โอ,เอาะ,ออ,เออะ,เออ,อำ,ใอ,ไอ,เอา,ฤ,ฤๅ,ฦ,ฦๅ" +)).split(",") +thai_vowel_all = [ + ("([ก-ฮ])ะ", "\\1อะ"), + ("([ก-ฮ])า", "\\1อา"), + ("อิ".replace("อ", "([ก-ฮ])"), "อิ".replace("อ", "\\1อ")), + ("อี".replace("อ", "([ก-ฮ])"), "อี".replace("อ", "\\1อ")), + ("อึ".replace("อ", "([ก-ฮ])", 1), "อึ".replace("อ", "\\1อ", 1)), + ("อื".replace("อ", "([ก-ฮ])", 1), "อื".replace("อ", "\\1อ", 1)), + ("อุ".replace("อ", "([ก-ฮ])", 1), "อุ".replace("อ", "\\1อ", 1)), + ("อู".replace("อ", "([ก-ฮ])", 1), "อู".replace("อ", "\\1อ", 1)), + ("เอะ".replace("อ", "([ก-ฮ])", 1), "\\1เอะ"), + ("เอ".replace("อ", "([ก-ฮ])", 1), "\\1เอ"), + ("แอะ".replace("อ", "([ก-ฮ])", 1), "\\1แอะ"), + ("แอ".replace("อ", "([ก-ฮ])", 1), "\\1แอ"), + ("เอียะ".replace("อ", "([ก-ฮ])", 1), "\\1เอียะ"), + ("เอีย".replace("อ", "([ก-ฮ])", 1), "\\1เอีย"), + ("เอือะ".replace("อ", "([ก-ฮ])", 1), "\\1เอือะ"), + ("เอือ".replace("อ", "([ก-ฮ])", 1), "\\1เอือ"), + ("อัวะ".replace("อ", "([ก-ฮ])", 1), "\\1อัวะ"), + ("อัว".replace("อ", "([ก-ฮ])", 1), "\\1อัว"), + ("โอะ".replace("อ", "([ก-ฮ])", 1), "\\1โอะ"), + ("โอ".replace("อ", "([ก-ฮ])", 1), "\\1โอ"), + ("เอาะ".replace("อ", "([ก-ฮ])", 1), "\\1เอาะ"), + ("ออ".replace("อ", "([ก-ฮ])", 1), "\\1ออ"), + ("เออะ".replace("อ", "([ก-ฮ])", 1), "\\1เออะ"), + ("เออ".replace("อ", "([ก-ฮ])", 1), "\\1เออ"), + ("อำ".replace("อ", "([ก-ฮ])", 1), "\\1อำ"), + ("ใอ".replace("อ", "([ก-ฮ])", 1), "\\1ใอ"), + ("ไอ".replace("อ", "([ก-ฮ])", 1), "\\1ไอ"), + ("เอา".replace("อ", "([ก-ฮ])", 1), "\\1เอา"), + ("อั".replace("อ", "([ก-ฮ])", 1), "\\1อะ"), +] +thai_vowel_all.sort(key=lambda t: len(t[0]), reverse=True) + + +
+
[docs]
+def thai_consonant_to_spelling(c: str) -> str:
+    """
+    Thai consonant to spelling
+
+    :param str c: A Thai consonant
+    :return: spelling
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.util import thai_consonant_to_spelling
+
+        print(thai_consonant_to_spelling("ก"))
+        # output: กอ
+    """
+    if len(c) == 1 and c in thai_consonants:
+        return c + "อ"
+    return c
+ + + +
+[docs] +def tone_to_spelling(t: str) -> str: + """ + Thai tonemarks to spelling + + :param str t: A Thai tonemarks + :return: spelling + :rtype: str + + :Example: + :: + + from pythainlp.util import tone_to_spelling + + print(tone_to_spelling("่")) # ไม้เอก + # output: ไม้เอก + """ + if t == "่": + return "ไม้เอก" + elif t == "้": + return "ไม้โท" + elif t == "๊": + return "ไม้ตรี" + elif t == "๋": + return "ไม้จัตวา" + return t
+
+
+
+def spelling(word: str) -> List[str]:
+    """
+    Thai word to spelling
+
+    This function supports Thai root words only.
+
+    :param str word: A Thai word
+    :return: spelling
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.util import spelling
+
+        print(spelling("เรียน"))
+        # output: ['รอ', 'เอีย', 'นอ', 'เรียน']
+
+        print(spelling("เฝ้า"))
+        # output: ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']
+    """
+    if not word or not isinstance(word, str):
+        return []
+    thai_vowel_tokenizer = Tokenizer(
+        custom_dict=thai_vowel + list(thai_consonants),
+        engine="longest"
+    )
+    word_pre = remove_tonemark(word).replace("็", "")
+    tone = [tone_to_spelling(i) for i in word if i in thai_tonemarks]
+    word_output = word_pre
+    for i, j in thai_vowel_all:
+        if len(re.findall(i, word_pre, re.U)) > 0:
+            if "็" in word and i == "เ([ก-ฮ])":
+                word_output = re.sub(i, "\\1เอะ", word_pre)
+            else:
+                word_output = re.sub(i, j, word_pre)
+            break
+    list_word_output = thai_vowel_tokenizer.word_tokenize(word_output)
+    output = [
+        i for i in [thai_consonant_to_spelling(i) for i in list_word_output]
+        if '์' not in i
+    ]
+    if word_pre == word:
+        return output + [word]
+    elif tone != []:
+        return output + [word_pre, tone[0], word]
+    elif "็" in word:
+        return output + [word]
+    else:
+        return output + [word_pre, word]
+
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/spell_words.html b/5.1/_modules/pythainlp/util/spell_words.html new file mode 100644 index 0000000..872c6c1 --- /dev/null +++ b/5.1/_modules/pythainlp/util/spell_words.html @@ -0,0 +1,274 @@ + + + + + + + + pythainlp.util.spell_words — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.spell_words

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+import re
+from typing import List
+
+from pythainlp import (
+    thai_above_vowels,
+    thai_below_vowels,
+    thai_consonants,
+    thai_follow_vowels,
+    thai_lead_vowels,
+    thai_letters,
+    thai_tonemarks,
+)
+from pythainlp.tokenize import Tokenizer, subword_tokenize
+
+_r1 = ["เ-ย", "เ-ะ", "แ-ะ", "โ-ะ", "เ-าะ", "เ-อะ", "เ-อ", "เ-า"]
+_r2 = ["–ั:วะ", "เ–ี:ยะ", "เ–ือะ", "–ั:ว", "เ–ี:ย", "เ–ื:อ", "–ื:อ"]
+tonemarks = {
+    i: "ไม้" + j
+    for i, j in zip(list(thai_tonemarks), ["เอก", "โท", "ตรี", "จัตวา"])
+}
+
+rule1 = [i.replace("-", f"([{thai_letters}](thai_tonemarks)?)") for i in _r1]
+rule2 = [i.replace("–", f"([{thai_letters}])").replace(":", "") for i in _r2]
+rule3 = [
+    i.replace("–", f"([{thai_letters}])").replace(":", f"([{thai_tonemarks}])")
+    for i in _r2
+]
+dict_vowel_ex = {}
+for i in _r1 + _r2:
+    dict_vowel_ex[i.replace("-", "อ").replace("–", "อ").replace(":", "")] = (
+        i.replace("-", "อ").replace(":", "").replace("–", "อ")
+    )
+dict_vowel = {}
+for i in _r1 + _r2:
+    dict_vowel[i.replace("-", "อ").replace("–", "อ").replace(":", "")] = (
+        i.replace("-", "อ").replace(":", "").replace("–", "อ")
+    )
+for i in thai_lead_vowels:
+    dict_vowel[i] = i + "อ"
+for i in thai_follow_vowels:
+    dict_vowel[i] = "อ" + i
+for i in thai_above_vowels:
+    dict_vowel[i] = "อ" + i
+for i in thai_below_vowels:
+    dict_vowel[i] = "อ" + i
+
+_cut = Tokenizer(list(dict_vowel.keys()) + list(thai_consonants), engine="mm")
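An editorial sketch of the vowel spelling table built by the loops above: lead vowels are spelled with a trailing "อ", while follow/above/below vowels are spelled with a leading "อ" (the module path matches the docstrings further down):

    from pythainlp.util import spell_words

    assert spell_words.dict_vowel["เ"] == "เอ"  # lead vowel + "อ"
    assert spell_words.dict_vowel["า"] == "อา"  # "อ" + follow vowel
    assert spell_words.dict_vowel["ุ"] == "อุ"  # "อ" + below vowel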
+
+
+def _clean(w):
+    if bool(re.match("|".join(rule3), w)):
+        for r in rule3:
+            if bool(re.match(r, w)):
+                w = re.sub(r, "\\1==\\2==", w)
+                temp = w.split("==")
+                w = (
+                    temp[0]
+                    + r.replace(f"([{thai_letters}])", "อ").replace(
+                        f"([{thai_tonemarks}])", ""
+                    )
+                    + temp[1]
+                )
+    elif bool(re.match("|".join(rule2), w)):
+        for r in rule2:
+            if bool(re.match(r, w)):
+                w = re.sub(r, "\\1", w) + r.replace(f"([{thai_letters}])", "อ")
+    elif bool(re.match("|".join(rule1), w)):
+        for r in rule1:
+            if bool(re.match(r, w)):
+                w = re.sub(r, "\\1", w) + r.replace(
+                    f"([{thai_letters}](thai_tonemarks)?)", "อ"
+                )
+    return w
+
+
+
+
[docs]
+def spell_syllable(text: str) -> List[str]:
+    """
+    Spell out a Thai syllable in Thai word-spelling form.
+
+    :param str text: a single Thai syllable
+    :return: List of spelled out parts of the syllable
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.util.spell_words import spell_syllable
+
+        print(spell_syllable("แมว"))
+        # output: ['มอ', 'วอ', 'แอ', 'แมว']
+    """
+    tokens = _cut.word_tokenize(_clean(text))
+
+    c_only = [tok + "อ" for tok in tokens if tok in set(thai_consonants)]
+    v_only = [dict_vowel[tok] for tok in tokens if tok in set(dict_vowel)]
+    t_only = [tonemarks[tok] for tok in tokens if tok in set(tonemarks.keys())]
+
+    return c_only + v_only + t_only + [text]
+ + + +
+
[docs]
+def spell_word(text: str) -> List[str]:
+    """
+    Spell out a Thai word, syllable by syllable, in Thai word-spelling form.
+
+    :param str text: a Thai word
+    :return: List of spelled out parts of the word
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.util.spell_words import spell_word
+
+        print(spell_word("คนดี"))
+        # output: ['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']
+    """
+    spellouts = []
+    tokens = subword_tokenize(text, engine="han_solo")
+
+    for tok in tokens:
+        spellouts.extend(spell_syllable(tok))
+
+    if len(tokens) > 1:
+        spellouts.append(text)
+
+    return spellouts
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/strftime.html b/5.1/_modules/pythainlp/util/strftime.html new file mode 100644 index 0000000..395ce70 --- /dev/null +++ b/5.1/_modules/pythainlp/util/strftime.html @@ -0,0 +1,483 @@ + + + + + + + + pythainlp.util.strftime — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.strftime

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Thai date/time formatting.
+"""
+
+import warnings
+from datetime import datetime
+from string import digits
+
+from pythainlp import thai_digits
+from pythainlp.util.date import (
+    thai_abbr_months,
+    thai_abbr_weekdays,
+    thai_full_months,
+    thai_full_weekdays,
+)
+
+__all__ = [
+    "thai_strftime",
+]
+
+_HA_TH_DIGITS = str.maketrans(digits, thai_digits)
+_BE_AD_DIFFERENCE = 543
+
+_NEED_L10N = "AaBbCcDFGgvXxYy+"  # flags that need localization
+_EXTENSIONS = "EO-_0^#"  # extension flags
+
+
+def _std_strftime(dt_obj: datetime, fmt_char: str) -> str:
+    """
+    Standard datetime.strftime() with normalization and exception handling.
+    """
+    str_ = ""
+    try:
+        str_ = dt_obj.strftime(f"%{fmt_char}")
+        if not str_ or str_ == "%{}".format(fmt_char):
+            # Normalize outputs for unsupported directives
+            # in different platforms:
+            # "%Q" may result "", "%Q", or "Q", make it all "Q"
+            str_ = fmt_char
+    except ValueError as err:
+        # Unsupported directives may raise ValueError on Windows,
+        # in that case just use the fmt_char
+        warnings.warn(
+            (
+                f"String format directive unknown/not supported: %{fmt_char}\n"
+                f"The system raised this ValueError: {err}\n"
+                f"Continuing without the directive."
+            ),
+            UserWarning,
+        )
+        str_ = fmt_char
+    return str_
+
+
+def _thai_strftime(dt_obj: datetime, fmt_char: str) -> str:
+    """
+    Conversion support for thai_strftime().
+
+    The fmt_char should be in _NEED_L10N when calling this function.
+    """
+    str_ = ""
+    if fmt_char == "A":
+        # National representation of the full weekday name
+        str_ = thai_full_weekdays[dt_obj.weekday()]
+    elif fmt_char == "a":
+        # National representation of the abbreviated weekday
+        str_ = thai_abbr_weekdays[dt_obj.weekday()]
+    elif fmt_char == "B":
+        # National representation of the full month name
+        str_ = thai_full_months[dt_obj.month - 1]
+    elif fmt_char == "b":
+        # National representation of the abbreviated month name
+        str_ = thai_abbr_months[dt_obj.month - 1]
+    elif fmt_char == "C":
+        # Thai Buddhist century (AD+543)/100 + 1 as decimal number;
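+        # e.g. 2019 AD -> 2019 + 543 = BE 2562 -> int(2562 / 100) + 1 = 26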
+        str_ = str(int((dt_obj.year + _BE_AD_DIFFERENCE) / 100) + 1).zfill(2)
+    elif fmt_char == "c":
+        # Locale's appropriate date and time representation
+        # Wed  6 Oct 01:40:00 1976
+        # พ   6 ต.ค. 01:40:00 2519  <-- left-aligned weekday, right-aligned day
+        str_ = "{:<2} {:>2} {} {} {}".format(
+            thai_abbr_weekdays[dt_obj.weekday()],
+            dt_obj.day,
+            thai_abbr_months[dt_obj.month - 1],
+            dt_obj.strftime("%H:%M:%S"),
+            str(dt_obj.year + _BE_AD_DIFFERENCE).zfill(4),
+        )
+    elif fmt_char == "D":
+        # Equivalent to ``%m/%d/%y''
+        str_ = "{}/{}".format(
+            dt_obj.strftime("%m/%d"),
+            (str(dt_obj.year + _BE_AD_DIFFERENCE)[-2:]).zfill(2),
+        )
+    elif fmt_char == "F":
+        # Equivalent to ``%Y-%m-%d''
+        str_ = "{}-{}".format(
+            str(dt_obj.year + _BE_AD_DIFFERENCE).zfill(4),
+            dt_obj.strftime("%m-%d"),
+        )
+    elif fmt_char == "G":
+        # ISO 8601 year with century representing the year that contains
+        # the greater part of the ISO week (%V). Monday as the first day
+        # of the week.
+        str_ = str(int(dt_obj.strftime("%G")) + _BE_AD_DIFFERENCE).zfill(4)
+    elif fmt_char == "g":
+        # Same year as in ``%G'',
+        # but as a decimal number without century (00-99).
+        str_ = (
+            str(int(dt_obj.strftime("%G")) + _BE_AD_DIFFERENCE)[-2:]
+        ).zfill(2)
+    elif fmt_char == "v":
+        # BSD extension, ' 6-Oct-1976'
+        str_ = "{:>2}-{}-{}".format(
+            dt_obj.day,
+            thai_abbr_months[dt_obj.month - 1],
+            str(dt_obj.year + _BE_AD_DIFFERENCE).zfill(4),
+        )
+    elif fmt_char == "X":
+        # Locale’s appropriate time representation.
+        str_ = dt_obj.strftime("%H:%M:%S")
+    elif fmt_char == "x":
+        # Locale’s appropriate date representation.
+        str_ = "{}/{}/{}".format(
+            str(dt_obj.day).zfill(2),
+            str(dt_obj.month).zfill(2),
+            str(dt_obj.year + _BE_AD_DIFFERENCE).zfill(4),
+        )
+    elif fmt_char == "Y":
+        # Year with century
+        str_ = (str(dt_obj.year + _BE_AD_DIFFERENCE)).zfill(4)
+    elif fmt_char == "y":
+        # Year without century
+        str_ = (str(dt_obj.year + _BE_AD_DIFFERENCE)[-2:]).zfill(2)
+    elif fmt_char == "+":
+        # National representation of the date and time
+        # (the format is similar to that produced by date(1))
+        # Wed  6 Oct 1976 01:40:00
+        str_ = "{:<2} {:>2} {} {} {}".format(
+            thai_abbr_weekdays[dt_obj.weekday()],
+            dt_obj.day,
+            thai_abbr_months[dt_obj.month - 1],
+            dt_obj.year + _BE_AD_DIFFERENCE,
+            dt_obj.strftime("%H:%M:%S"),
+        )
+    else:
+        # No known localization available, use Python's default
+        # With a good _NEED_L10N and _EXTENSIONS, this should not happen
+        str_ = _std_strftime(dt_obj, fmt_char)  # pragma: no cover
+
+    return str_
+
+
+
+
[docs]
+def thai_strftime(
+    dt_obj: datetime,
+    fmt: str = "%-d %b %y",
+    thaidigit: bool = False,
+) -> str:
+    """
+    Convert :class:`datetime.datetime` into Thai date and time format.
+
+    The formatting directives are similar to :func:`datetime.datetime.strftime`.
+
+    This function uses Thai names and Thai Buddhist Era for these directives:
+        * **%a** - abbreviated weekday name
+          (i.e. "จ", "อ", "พ", "พฤ", "ศ", "ส", "อา")
+        * **%A** - full weekday name
+          (i.e. "วันจันทร์", "วันอังคาร", "วันเสาร์", "วันอาทิตย์")
+        * **%b** - abbreviated month name
+          (i.e. "ม.ค.","ก.พ.","มี.ค.","เม.ย.","พ.ค.","มิ.ย.", "ธ.ค.")
+        * **%B** - full month name
+          (i.e. "มกราคม", "กุมภาพันธ์", "พฤศจิกายน", "ธันวาคม",)
+        * **%y** - year without century (i.e. "56", "10")
+        * **%Y** - year with century (i.e. "2556", "2410")
+        * **%c** - date and time representation
+          (i.e. "พ 6 ต.ค. 01:40:00 2519")
+        * **%v** - short date representation
+          (i.e. " 6-ม.ค.-2562", "27-ก.พ.-2555")
+
+    Other directives will be passed to datetime.strftime()
+
+    :Note:
+        * The Thai Buddhist Era (BE) year is simply converted from AD
+          by adding 543. This is certainly not accurate for years
+          before 1941 AD, due to the change in Thai New Year's Day.
+        * This is meant to be an interim solution, since
+          Python's standard locale module (which relies on C's strftime())
+          does not support the "th" or "th_TH" locale yet. If supported,
+          we can just locale.setlocale(locale.LC_TIME, "th_TH")
+          and then use native datetime.strftime().
+
+    We are trying to make this platform-independent and support as many
+    extensions as possible. See these links for strftime() extensions
+    in POSIX, BSD, and GNU libc:
+
+        * Python
+          https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
+        * C http://www.cplusplus.com/reference/ctime/strftime/
+        * GNU https://metacpan.org/pod/POSIX::strftime::GNU
+        * Linux https://linux.die.net/man/3/strftime
+        * OpenBSD https://man.openbsd.org/strftime.3
+        * FreeBSD https://www.unix.com/man-page/FreeBSD/3/strftime/
+        * macOS
+          https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/strftime.3.html
+        * PHP https://secure.php.net/manual/en/function.strftime.php
+        * JavaScript's implementation https://github.com/samsonjs/strftime
+        * strftime() quick reference http://www.strftime.net/
+
+    :param datetime dt_obj: an instantiated object of
+                            :class:`datetime.datetime`
+    :param str fmt: string containing date and time directives
+    :param bool thaidigit: If `thaidigit` is set to **False** (default),
+                           numbers will be represented in Arabic digits.
+                           If it is set to **True**, they will be represented
+                           in Thai digits.
+
+    :return: Date and time text, with month in Thai name and year in
+             Thai Buddhist Era. The year is simply converted from AD
+             by adding 543 (it will not be accurate for years before 1941 AD,
+             due to the change in Thai New Year's Day).
+    :rtype: str
+
+    :Example:
+    ::
+
+        from datetime import datetime
+        from pythainlp.util import thai_strftime
+
+        datetime_obj = datetime(year=2019, month=6, day=9, \\
+            hour=5, minute=59, second=0, microsecond=0)
+
+        print(datetime_obj)
+        # output: 2019-06-09 05:59:00
+
+        thai_strftime(datetime_obj, "%A %d %B %Y")
+        # output: 'วันอาทิตย์ 09 มิถุนายน 2562'
+
+        thai_strftime(datetime_obj, "%a %-d %b %y") # no padding
+        # output: 'อา 9 มิ.ย. 62'
+
+        thai_strftime(datetime_obj, "%a %_d %b %y") # space padding
+        # output: 'อา 9 มิ.ย. 62'
+
+        thai_strftime(datetime_obj, "%a %0d %b %y") # zero padding
+        # output: 'อา 09 มิ.ย. 
62' + + thai_strftime(datetime_obj, "%-H นาฬิกา %-M นาที", thaidigit=True) + # output: '๕ นาฬิกา ๕๙ นาที' + + thai_strftime(datetime_obj, "%D (%v)") + # output: '06/09/62 ( 9-มิ.ย.-2562)' + + thai_strftime(datetime_obj, "%c") + # output: 'อา 9 มิ.ย. 05:59:00 2562' + + thai_strftime(datetime_obj, "%H:%M %p") + # output: '01:40 AM' + + thai_strftime(datetime_obj, "%H:%M %#p") + # output: '01:40 am' + """ + thaidate_parts = [] + + i = 0 + fmt_len = len(fmt) + while i < fmt_len: + str_ = "" + if fmt[i] == "%": + j = i + 1 + if j < fmt_len: + fmt_char = fmt[j] + if fmt_char in _NEED_L10N: # requires localization? + str_ = _thai_strftime(dt_obj, fmt_char) + elif fmt_char in _EXTENSIONS: + fmt_char_ext = fmt_char + k = j + 1 + if k < fmt_len: + fmt_char = fmt[k] + if fmt_char in _NEED_L10N: + str_ = _thai_strftime(dt_obj, fmt_char) + else: + str_ = _std_strftime(dt_obj, fmt_char) + + if fmt_char_ext == "-": + # GNU libc extension, + # no padding + if str_[0] and str_[0] in " 0": + str_ = str_[1:] + elif fmt_char_ext == "_": + # GNU libc extension, + # explicitly specify space (" ") for padding + if str_[0] and str_[0] == "0": + str_ = " " + str_[1:] + elif fmt_char_ext == "0": + # GNU libc extension, + # explicitly specify zero ("0") for padding + if str_[0] and str_[0] == " ": + str_ = "0" + str_[1:] + elif fmt_char_ext == "^": + # GNU libc extension, + # convert to upper case + str_ = str_.upper() + elif fmt_char_ext == "#": + # GNU libc extension, + # swap case - useful for %Z + str_ = str_.swapcase() + elif fmt_char_ext == "E": + # POSIX extension, + # uses the locale's alternative representation + # Not implemented yet + pass + elif fmt_char_ext == "O": + # POSIX extension, + # uses the locale's alternative numeric symbols + str_ = str_.translate(_HA_TH_DIGITS) + i = i + 1 # consume char after format char + else: + # format char at string's end has no meaning + str_ = fmt_char_ext + else: # not in _NEED_L10N nor _EXTENSIONS + # no known localization available, use Python's default + str_ = _std_strftime(dt_obj, fmt_char) + + i = i + 1 # consume char after "%" + else: + # % char at string's end has no meaning + str_ = "%" + else: + str_ = fmt[i] + + thaidate_parts.append(str_) + i = i + 1 + + thaidate_text = "".join(thaidate_parts) + + if thaidigit: + thaidate_text = thaidate_text.translate(_HA_TH_DIGITS) + + return thaidate_text
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/syllable.html b/5.1/_modules/pythainlp/util/syllable.html new file mode 100644 index 0000000..64d9e4c --- /dev/null +++ b/5.1/_modules/pythainlp/util/syllable.html @@ -0,0 +1,507 @@ + + + + + + + + pythainlp.util.syllable — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.syllable

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Syllable tools
+"""
+
+import re
+
+from pythainlp import thai_consonants, thai_tonemarks
+
+spelling_class = {
+    "กง": list("ง"),
+    "กม": list("ม"),
+    "เกย": list("ย"),
+    "เกอว": list("ว"),
+    "กน": list("นญณรลฬ"),
+    "กก": list("กขคฆ"),
+    "กด": list("ดจชซฎฏฐฑฒตถทธศษส"),
+    "กบ": list("บปภพฟ"),
+}
+
+thai_consonants_all = list(thai_consonants)
+thai_consonants_all.remove("อ")
+
+_temp = list("".join(["".join(v) for v in spelling_class.values()]))
+not_spelling_class = [j for j in thai_consonants_all if j not in _temp]
+
+# vowel's short sound
+short = "ะัิึุ"
+re_short = re.compile("เ(.*)ะ|แ(.*)ะ|เ(.*)อะ|โ(.*)ะ|เ(.*)าะ", re.U)
+pattern = re.compile("เ(.*)า", re.U)  # เ-า is live syllable
+
+_check_1 = []
+# These spelling consonants are live syllables.
+for i in ["กง", "กน", "กม", "เกย", "เกอว"]:
+    _check_1.extend(spelling_class[i])
+
+# These spelling consonants are dead syllables.
+_check_2 = spelling_class["กก"] + spelling_class["กบ"] + spelling_class["กด"]
+
+thai_low_sonorants = list("งนมยรลว")
+thai_low_aspirates = list("คชซทพฟฮ")
+thai_low_irregular = list("ฆญณธภฅฌฑฒฬ")
+
+thai_mid_plains = list("กจดตบปอฎฏ")
+
+thai_high_aspirates = list("ขฉถผฝสห")
+thai_high_irregular = list("ศษฃฐ")
+thai_initial_consonant_type = {
+    "low": thai_low_sonorants + thai_low_aspirates + thai_low_irregular,
+    "mid": thai_mid_plains,
+    "high": thai_high_aspirates + thai_high_irregular,
+}
+thai_initial_consonant_to_type = {}
+
+for k, v in thai_initial_consonant_type.items():
+    for i in v:
+        thai_initial_consonant_to_type[i] = k
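A brief editorial sketch of the initial-consonant class lookup the loop above builds; these classes feed the tone rules in tone_detector() below:

    from pythainlp.util import syllable  # the module shown above

    assert syllable.thai_initial_consonant_to_type["ก"] == "mid"   # mid plain
    assert syllable.thai_initial_consonant_to_type["ข"] == "high"  # high aspirate
    assert syllable.thai_initial_consonant_to_type["ม"] == "low"   # low sonorant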
+
+
+
+[docs] +def sound_syllable(syllable: str) -> str: + """ + Sound syllable classification + + This function is sound syllable classification. + The syllable is a live syllable or dead syllable. + + :param str syllable: Thai syllable + :return: syllable's type ("live" or "dead") + :rtype: str + + :Example: + :: + + from pythainlp.util import sound_syllable + + print(sound_syllable("มา")) + # output: live + + print(sound_syllable("เลข")) + # output: dead + """ + # if len of syllable < 2 + if len(syllable) < 2: + return "dead" + + # get consonants + consonants = [i for i in syllable if i in list(thai_consonants_all)] + if ( + (len(consonants) == 0) + and ("อ" in syllable) + and any((c in set("เ")) for c in syllable) + and (len(syllable) == 2) + ): + return "live" + + # get spelling consonants + spelling_consonant = consonants[-1] + if (spelling_consonant in _check_2) and ( + any((c in set("าีืแูาเโ")) for c in syllable) is False + and any((c in set("ำใไ")) for c in syllable) is False + and bool(pattern.search(syllable)) is not True + ): + return "dead" + + if any((c in set("าีืแูาโ")) for c in syllable): # in syllable: + if ( + spelling_consonant in _check_1 + and bool(re_short.search(syllable)) is not True + ): + return "live" + + if ( + spelling_consonant != syllable[-1] + and bool(re_short.search(syllable)) is not True + ): + return "live" + + if spelling_consonant in _check_2: + return "dead" + + if bool(re_short.search(syllable)) or any( + (c in set(short)) for c in syllable + ): + return "dead" + + return "live" + + if any((c in set("ำใไ")) for c in syllable): + return "live" # if these vowel's long sounds are live syllables + + if bool(pattern.search(syllable)): # if it is เ-า + return "live" + + if spelling_consonant in _check_1: + if ( + bool(re_short.search(syllable)) + or any((c in set(short)) for c in syllable) + ) and len(consonants) < 2: + return "dead" + + if syllable[-1] in set(short): + return "dead" + + return "live" + + if bool( + re_short.search(syllable) + ) or any( # if vowel's short sound is found + (c in set(short)) for c in syllable + ): # consonant in short + return "dead" + + return "dead"
+ + + +
+
[docs]
+def syllable_open_close_detector(syllable: str) -> str:
+    """
+    Open/close Thai syllable detector
+
+    This function detects whether a Thai syllable is an open or a closed syllable.
+
+    :param str syllable: Thai syllable
+    :return: "open" or "close"
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.util import syllable_open_close_detector
+
+        print(syllable_open_close_detector("มาก"))
+        # output: close
+
+        print(syllable_open_close_detector("คะ"))
+        # output: open
+    """
+    consonants = [i for i in syllable if i in list(thai_consonants)]
+
+    if len(consonants) < 2:
+        return "open"
+
+    if len(consonants) == 2 and consonants[-1] == "อ":
+        return "open"
+
+    return "close"
+ + + +
+
[docs]
+def syllable_length(syllable: str) -> str:
+    """
+    Thai syllable length
+
+    This function determines a syllable's vowel length (long or short).
+
+    :param str syllable: Thai syllable
+    :return: syllable's length (long or short)
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.util import syllable_length
+
+        print(syllable_length("มาก"))
+        # output: long
+
+        print(syllable_length("คะ"))
+        # output: short
+    """
+    consonants = [i for i in syllable if i in list(thai_consonants)]
+    if len(consonants) <= 3 and any((c in set(short)) for c in syllable):
+        return "short"
+
+    if bool(re_short.search(syllable)):
+        return "short"
+
+    return "long"
+ + + +def _tone_mark_detector(syllable: str) -> str: + tone_mark = [i for i in syllable if i in list(thai_tonemarks)] + if tone_mark == []: + return "" + + return tone_mark[0] + + +def _check_sonorant_syllable(syllable: str) -> bool: + _sonorant = [i for i in syllable if i in thai_low_sonorants] + consonants = [i for i in syllable if i in list(thai_consonants)] + + if _sonorant[-1] == consonants[-2]: + return True + + if _sonorant[-1] == consonants[-1]: + return True + + return False + + +
+[docs] +def tone_detector(syllable: str) -> str: + """ + Thai tone detector for syllables + + Return tone of a syllable. + + - l: low + - m: mid + - r: rising + - f: falling + - h: high + - empty string: cannot be detected + + :param str syllable: Thai syllable + :return: syllable's tone (l, m, h, r, f) or empty if it cannot be detected + :rtype: str + + :Example: + :: + + from pythainlp.util import tone_detector + + print(tone_detector("มา")) + # output: m + + print(tone_detector("ไม้")) + # output: h + """ + s = sound_syllable(syllable) + # get consonants + consonants = [i for i in syllable if i in list(thai_consonants)] + initial_consonant = consonants[0] + tone_mark = _tone_mark_detector(syllable) + syllable_check = syllable_open_close_detector(syllable) + syllable_check_length = syllable_length(syllable) + initial_consonant_type = thai_initial_consonant_to_type[initial_consonant] + # r for store value + r = "" + if len(consonants) > 1 and (initial_consonant in ("อ", "ห")): + consonant_ending = _check_sonorant_syllable(syllable) + if ( + initial_consonant == "อ" + and consonant_ending + and s == "live" + and tone_mark == "่" + ): + r = "l" + elif ( + initial_consonant == "ห" + and consonant_ending + and s == "live" + and tone_mark == "่" + ): + r = "l" + elif initial_consonant == "อ" and consonant_ending and s == "dead": + r = "l" + elif ( + initial_consonant == "ห" + and consonant_ending + and s == "live" + and tone_mark == "้" + ): + r = "f" + elif initial_consonant == "ห" and consonant_ending and s == "dead": + r = "l" + elif initial_consonant == "ห" and consonant_ending and s == "live": + r = "r" + elif initial_consonant_type == "high" and s == "live" and tone_mark == "่": + r = "l" + elif initial_consonant_type == "mid" and s == "live" and tone_mark == "่": + r = "l" + elif initial_consonant_type == "low" and tone_mark == "้": + r = "h" + elif initial_consonant_type == "mid" and tone_mark == "๋": + r = "r" + elif initial_consonant_type == "mid" and tone_mark == "๊": + r = "h" + elif initial_consonant_type == "low" and tone_mark == "่": + r = "f" + elif initial_consonant_type == "mid" and tone_mark == "้": + r = "f" + elif initial_consonant_type == "high" and tone_mark == "้": + r = "f" + elif ( + initial_consonant_type == "low" + and syllable_check_length == "short" + and syllable_check == "close" + and s == "dead" + ): + r = "h" + elif ( + initial_consonant_type == "low" + and syllable_check_length == "long" + and syllable_check == "close" + and s == "dead" + ): + r = "f" + elif ( + initial_consonant_type == "low" + and syllable_check_length == "short" + and syllable_check == "open" + ): + r = "h" + elif initial_consonant_type == "mid" and s == "dead": + r = "l" + elif initial_consonant_type == "high" and s == "dead": + r = "l" + elif initial_consonant_type == "low" and s == "live": + r = "m" + elif initial_consonant_type == "mid" and s == "live": + r = "m" + elif initial_consonant_type == "high" and s == "live": + r = "r" + + return r
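A short editorial trace of how the helpers above combine for one syllable; the expected values follow from the rules as written:

    from pythainlp.util import (
        sound_syllable,
        syllable_length,
        syllable_open_close_detector,
        tone_detector,
    )

    # "มาก": low-class initial ("ม"), dead, closed, long vowel -> falling tone
    print(sound_syllable("มาก"))                # dead
    print(syllable_open_close_detector("มาก"))  # close
    print(syllable_length("มาก"))               # long
    print(tone_detector("มาก"))                 # f  (falling)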
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/thai.html b/5.1/_modules/pythainlp/util/thai.html new file mode 100644 index 0000000..fced99b --- /dev/null +++ b/5.1/_modules/pythainlp/util/thai.html @@ -0,0 +1,427 @@ + + + + + + + + pythainlp.util.thai — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.thai

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Check if it is Thai text
+"""
+
+import string
+from typing import List, Tuple
+
+from pythainlp import (
+    thai_above_vowels,
+    thai_below_vowels,
+    thai_consonants,
+    thai_digits,
+    thai_follow_vowels,
+    thai_lead_vowels,
+    thai_punctuations,
+    thai_signs,
+    thai_tonemarks,
+    thai_vowels,
+)
+
+_DEFAULT_IGNORE_CHARS = string.whitespace + string.digits + string.punctuation
+_TH_FIRST_CHAR_ASCII = 3584
+_TH_LAST_CHAR_ASCII = 3711
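+# Despite the "_ASCII" names, these are Unicode code points:
+# 3584 = 0x0E00 and 3711 = 0x0E7F bound the Unicode "Thai" block.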
+
+
+
+[docs] +def isthaichar(ch: str) -> bool: + """Check if a character is a Thai character. + + :param ch: input character + :type ch: str + :return: True if ch is a Thai character, otherwise False. + :rtype: bool + + :Example: + :: + + from pythainlp.util import isthaichar + + isthaichar("ก") # THAI CHARACTER KO KAI + # output: True + + isthaichar("๕") # THAI DIGIT FIVE + # output: True + """ + ch_val = ord(ch) + if _TH_FIRST_CHAR_ASCII <= ch_val <= _TH_LAST_CHAR_ASCII: + return True + return False
+ + + +
+[docs] +def isthai(text: str, ignore_chars: str = ".") -> bool: + """Check if every character in a string is a Thai character. + + :param text: input text + :type text: str + :param ignore_chars: characters to be ignored, defaults to "." + :type ignore_chars: str, optional + :return: True if every character in the input string is Thai, + otherwise False. + :rtype: bool + + :Example: + :: + + from pythainlp.util import isthai + + isthai("กาลเวลา") + # output: True + + isthai("กาลเวลา.") + # output: True + + isthai("กาล-เวลา") + # output: False + + isthai("กาล-เวลา +66", ignore_chars="01234567890+-.,") + # output: True + + """ + if not ignore_chars: + ignore_chars = "" + + for ch in text: + if ch not in ignore_chars and not isthaichar(ch): + return False + return True
+ + + +
+[docs] +def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float: + """Find proportion of Thai characters in a given text + + :param text: input text + :type text: str + :param ignore_chars: characters to be ignored, defaults to whitespace,\\ + digits, and punctuation marks. + :type ignore_chars: str, optional + :return: proportion of Thai characters in the text (percentage) + :rtype: float + + :Example: + :: + + from pythainlp.util import countthai + + countthai("ไทยเอ็นแอลพี 3.0") + # output: 100.0 + + countthai("PyThaiNLP 3.0") + # output: 0.0 + + countthai("ใช้งาน PyThaiNLP 3.0") + # output: 40.0 + + countthai("ใช้งาน PyThaiNLP 3.0", ignore_chars="") + # output: 30.0 + """ + if not text or not isinstance(text, str): + return 0.0 + + if not ignore_chars: + ignore_chars = "" + + num_thai = 0 + num_ignore = 0 + + for ch in text: + if ch in ignore_chars: + num_ignore += 1 + elif isthaichar(ch): + num_thai += 1 + + num_count = len(text) - num_ignore + + if num_count == 0: + return 0.0 + + return (num_thai / num_count) * 100
+ + + +
+[docs] +def display_thai_char(ch: str) -> str: + """Prefix an underscore (_) to a high-position vowel or a tone mark, + to ease readability. + + :param ch: input character + :type ch: str + :return: "_" + ch + :rtype: str + + :Example: + :: + + from pythainlp.util import display_thai_char + + display_thai_char("้") + # output: "_้" + """ + + if ( + ch in thai_above_vowels + or ch in thai_tonemarks + or ch in "\u0e33\u0e4c\u0e4d\u0e4e" + ): + # last condition is Sra Aum, Thanthakhat, Nikhahit, Yamakkan + return "_" + ch + else: + return ch
+ + + +
+
[docs]
+def thai_word_tone_detector(word: str) -> List[Tuple[str, str]]:
+    """
+    Thai tone detector for words.
+
+    It uses pythainlp.transliterate.pronunciate for converting a word to\
+    its pronunciation.
+
+    :param str word: Thai word.
+    :return: Thai pronunciation with the tone of each syllable\
+             (l, m, h, r, f or empty if it cannot be detected)
+    :rtype: List[Tuple[str, str]]
+
+    :Example:
+    ::
+
+        from pythainlp.util import thai_word_tone_detector
+
+        print(thai_word_tone_detector("คนดี"))
+        # output: [('คน', 'm'), ('ดี', 'm')]
+
+        print(thai_word_tone_detector("มือถือ"))
+        # output: [('มือ', 'm'), ('ถือ', 'r')]
+    """
+    from ..transliterate import pronunciate
+    from ..util.syllable import tone_detector
+
+    _pronunciate = pronunciate(word).split("-")
+    return [(i, tone_detector(i.replace("หฺ", "ห"))) for i in _pronunciate]
+ + + +
+[docs] +def count_thai_chars(text: str) -> dict: + """ + Count Thai characters by type + + This function will give you numbers of Thai characters by type\ + (consonants, vowels, lead_vowels, follow_vowels, above_vowels,\ + below_vowels, tonemarks, signs, thai_digits, punctuations, non_thai) + + :param str text: Text + :return: Dict with numbers of Thai characters by type + :rtype: dict + + :Example: + :: + + from pythainlp.util import count_thai_chars + + count_thai_chars("ทดสอบภาษาไทย") + # output: { + # 'vowels': 3, + # 'lead_vowels': 1, + # 'follow_vowels': 2, + # 'above_vowels': 0, + # 'below_vowels': 0, + # 'consonants': 9, + # 'tonemarks': 0, + # 'signs': 0, + # 'thai_digits': 0, + # 'punctuations': 0, + # 'non_thai': 0 + # } + """ + _dict = { + "vowels": 0, + "lead_vowels": 0, + "follow_vowels": 0, + "above_vowels": 0, + "below_vowels": 0, + "consonants": 0, + "tonemarks": 0, + "signs": 0, + "thai_digits": 0, + "punctuations": 0, + "non_thai": 0, + } + for c in text: + if c in thai_vowels: + _dict["vowels"] += 1 + if c in thai_lead_vowels: + _dict["lead_vowels"] += 1 + elif c in thai_follow_vowels: + _dict["follow_vowels"] += 1 + elif c in thai_above_vowels: + _dict["above_vowels"] += 1 + elif c in thai_below_vowels: + _dict["below_vowels"] += 1 + elif c in thai_consonants: + _dict["consonants"] += 1 + elif c in thai_tonemarks: + _dict["tonemarks"] += 1 + elif c in thai_signs: + _dict["signs"] += 1 + elif c in thai_digits: + _dict["thai_digits"] += 1 + elif c in thai_punctuations: + _dict["punctuations"] += 1 + else: + _dict["non_thai"] += 1 + return _dict
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/thai_lunar_date.html b/5.1/_modules/pythainlp/util/thai_lunar_date.html new file mode 100644 index 0000000..2440afa --- /dev/null +++ b/5.1/_modules/pythainlp/util/thai_lunar_date.html @@ -0,0 +1,539 @@ + + + + + + + + pythainlp.util.thai_lunar_date — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for pythainlp.util.thai_lunar_date

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+This file is a port from
+> https://gist.github.com/touchiep/99f4f5bb349d6b983ef78697630ab78e
+"""
+
+from datetime import date, timedelta
+from typing import Dict, List, Tuple, Union
+
+_YEAR_DEV: Dict[int, float] = {
+    0: 0,
+    1901: 0.122733000004352,
+    1906: 1.91890000045229e-02,
+    1911: -8.43549999953059e-02,
+    1916: -0.187898999995135,
+    1921: -0.291442999994964,
+    1926: 7.44250000052413e-02,
+    1931: -2.91189999945876e-02,
+    1936: -0.132662999994416,
+    1941: -0.236206999994245,
+    1946: -0.339750999994074,
+    1951: -0.443294999993903,
+    1956: -7.74269999936981e-02,
+    1961: -0.180970999993527,
+    1966: -0.284514999993356,
+    1971: -0.388058999993185,
+    1976: -0.491602999993014,
+    1981: -0.595146999992842,
+    1986: -0.698690999992671,
+    1991: -0.332822999992466,
+    1996: -0.436366999992295,
+    2001: -0.539910999992124,
+    2006: -0.643454999991953,
+    2011: 0.253001000008218,
+    2016: 0.149457000008389,
+    2021: -0.484674999991406,
+    2026: -0.588218999991235,
+    2031: 0.308237000008937,
+    2036: 0.204693000009108,
+    2041: 0.101149000009279,
+    2046: -2.39499999055015e-03,
+    2051: -0.105938999990379,
+    2056: 0.259929000009826,
+    2061: 0.156385000009997,
+    2066: 5.28410000101682e-02,
+    2071: -5.07029999896607e-02,
+    2076: -0.15424699998949,
+    2081: -0.257790999989318,
+    2086: 0.108077000010887,
+    2091: 4.53300001105772e-03,
+    2096: -9.90109999887712e-02,
+    2101: -0.2025549999886,
+    2106: -0.306098999988429,
+    2111: -0.409642999988258,
+    2116: -4.37749999880528e-02,
+    2121: -0.147318999987882,
+    2126: -0.250862999987711,
+    2131: -0.354406999987539,
+    2136: -0.457950999987368,
+    2141: -0.561494999987197,
+    2146: -0.665038999987026,
+    2151: -0.299170999986821,
+    2156: -0.40271499998665,
+    2161: -0.506258999986479,
+    2166: -0.609802999986308,
+    2171: -0.713346999986137,
+    2176: 0.183109000014035,
+    2181: -0.45102299998576,
+    2186: -0.554566999985589,
+    2191: 0.341889000014582,
+    2196: 0.238345000014753,
+    2201: 0.134801000014924,
+    2206: 3.12570000150951e-02,
+    2211: -7.22869999847338e-02,
+    2216: 0.293581000015471,
+    2221: 0.190037000015642,
+    2226: 8.64930000158135e-02,
+    2231: -1.70509999840154e-02,
+    2236: -0.120594999983844,
+    2241: -0.224138999983673,
+    2246: 0.141729000016532,
+    2251: 0.038185000016703,
+    2256: -6.53589999831259e-02,
+    2261: -0.168902999982955,
+    2266: -0.272446999982784,
+    2271: -0.375990999982613,
+    2276: -1.01229999824075e-02,
+    2281: -0.113666999982236,
+    2286: -0.217210999982065,
+    2291: -0.320754999981894,
+    2296: -0.424298999981723,
+    2301: -0.527842999981552,
+    2306: -0.631386999981381,
+    2311: -0.265518999981176,
+    2316: -0.369062999981005,
+    2321: -0.472606999980834,
+    2326: -0.576150999980662,
+    2331: -0.679694999980491,
+    2336: 0.21676100001968,
+    2341: -0.417370999980115,
+    2346: -0.520914999979944,
+    2351: -0.624458999979773,
+    2356: 0.271997000020398,
+    2361: 0.168453000020569,
+    2366: 6.49090000207404e-02,
+    2371: -3.86349999790885e-02,
+    2376: 0.327233000021117,
+    2381: 0.223689000021288,
+    2386: 0.120145000021459,
+    2391: 1.66010000216299e-02,
+    2396: -0.086942999978199,
+    2401: -0.190486999978028,
+    2406: 0.175381000022177,
+    2411: 7.18370000223483e-02,
+    2416: -3.17069999774806e-02,
+    2421: -0.135250999977309,
+    2426: -0.238794999977138,
+    2431: -0.342338999976967,
+    2436: 2.35290000232378e-02,
+    2441: -8.00149999765911e-02,
+    2446: -0.18355899997642,
+    2451: -0.287102999976249,
+    2456: -0.390646999976078,
+}
+
+_BEGIN_DATES = [
+    date(1902, 11, 30),
+    date(1912, 12, 8),
+    date(1922, 11, 19),
+    date(1932, 11, 27),
+    date(1942, 12, 7),
+    date(1952, 11, 16),
+    date(1962, 11, 26),
+    date(1972, 12, 5),
+    date(1982, 11, 15),
+    date(1992, 11, 24),
+    date(2002, 12, 4),
+    date(2012, 11, 13),
+    date(2022, 11, 23),
+    date(2032, 12, 2),
+    date(2042, 12, 12),
+    date(2052, 11, 21),
+    date(2062, 12, 1),
+    date(2072, 12, 9),
+    date(2082, 11, 20),
+    date(2092, 11, 28),
+    date(2102, 12, 9),
+    date(2112, 11, 18),
+    date(2122, 11, 28),
+    date(2132, 12, 7),
+    date(2142, 11, 17),
+    date(2152, 11, 26),
+    date(2162, 12, 6),
+    date(2172, 11, 15),
+    date(2182, 11, 25),
+    date(2192, 12, 4),
+    date(2202, 12, 15),
+    date(2212, 11, 24),
+    date(2222, 12, 4),
+    date(2232, 12, 12),
+    date(2242, 11, 23),
+    date(2252, 12, 1),
+    date(2262, 12, 11),
+    date(2272, 11, 20),
+    date(2282, 11, 30),
+    date(2292, 12, 9),
+    date(2302, 11, 20),
+    date(2312, 11, 29),
+    date(2322, 12, 9),
+    date(2332, 11, 18),
+    date(2342, 11, 28),
+    date(2352, 12, 7),
+    date(2362, 12, 17),
+    date(2372, 11, 26),
+    date(2382, 12, 6),
+    date(2392, 12, 14),
+    date(2402, 11, 25),
+    date(2412, 12, 3),
+    date(2422, 12, 13),
+    date(2432, 11, 23),
+    date(2442, 12, 2),
+    date(2452, 12, 11),
+]
+
+_DAYS_354 = [29, 30, 29, 30, 29, 30, 29, 30, 29, 30, 29, 30, 29, 30]
+_DAYS_355 = [29, 30, 29, 30, 29, 30, 30, 30, 29, 30, 29, 30, 29, 30]
+_DAYS_384 = [29, 30, 29, 30, 29, 30, 29, 30, 30, 29, 30, 29, 30, 29, 30]
+
+# Zodiac names in Thai, English, and Numeric representations
+_ZODIAC: Dict[int, List[Union[str, int]]] = {
+    1: [
+        "ชวด",
+        "ฉลู",
+        "ขาล",
+        "เถาะ",
+        "มะโรง",
+        "มะเส็ง",
+        "มะเมีย",
+        "มะแม",
+        "วอก",
+        "ระกา",
+        "จอ",
+        "กุน",
+    ],
+    2: [
+        "RAT",
+        "OX",
+        "TIGER",
+        "RABBIT",
+        "DRAGON",
+        "SNAKE",
+        "HORSE",
+        "GOAT",
+        "MONKEY",
+        "ROOSTER",
+        "DOG",
+        "PIG",
+    ],
+    3: list(range(1, 13)),
+}
+
+
+def _calculate_f_year_f_dev(year: int) -> Tuple[int, float]:
+    if year in _YEAR_DEV:
+        return year, _YEAR_DEV[year]
+
+    nearest_lower_year = max(y for y in _YEAR_DEV if y < year)
+    return nearest_lower_year, _YEAR_DEV[nearest_lower_year]
+
+
+def athikamas(year: int) -> bool:
+    athi = ((year - 78) - 0.45222) % 2.7118886
+    return athi < 1
+
+
+def athikavar(year: int) -> bool:
+    if athikamas(year):
+        return False
+
+    if athikamas(year + 1):
+        cutoff = 1.69501433191599e-02
+    else:
+        cutoff = -1.42223099315486e-02
+    return deviation(year) > cutoff
+
+
+def deviation(year: int) -> float:
+    curr_dev = 0.0
+    last_dev = 0.0
+    f_year, f_dev = _calculate_f_year_f_dev(year)
+    if year == f_year:
+        curr_dev = f_dev
+    else:
+        f_year = f_year + 1
+        for i in range(f_year, year + 1):
+            if i == f_year:
+                last_dev = f_dev
+            else:
+                last_dev = curr_dev
+            if athikamas(i - 1):
+                curr_dev = -0.102356
+            elif athikavar(i - 1):
+                curr_dev = -0.632944
+            else:
+                curr_dev = 0.367056
+            curr_dev = last_dev + curr_dev
+
+    return curr_dev
+
+
+def last_day_in_year(year: int) -> int:
+    if athikamas(year):
+        return 384
+    elif athikavar(year):
+        return 355
+
+    return 354
+
+
+def athikasurathin(year: int) -> bool:
+    """
+    Check if a year is a solar (Gregorian) leap year, อธิกสุรทิน (athikasurathin)
+    """
+    # Check divisibility by 400 (divisible by 400 is always a leap year)
+    if year % 400 == 0:
+        return True
+
+    # Check divisibility by 100 (divisible by 100 but not 400 is not a leap
+    # year)
+    elif year % 100 == 0:
+        return False
+
+    # Check divisibility by 4 (divisible by 4 but not by 100 is a leap year)
+    elif year % 4 == 0:
+        return True
+
+    # All other cases are not leap years
+    return False
+
+
+def number_day_in_year(year: int) -> int:
+    if athikasurathin(year):
+        return 366
+
+    return 365
+
+
+
+[docs] +def th_zodiac(year: int, output_type: int = 1) -> Union[str, int]: + """ + Thai Zodiac Year Name + Converts a Gregorian year to its corresponding Zodiac name. + + :param int year: The Gregorian year. AD (Anno Domini) + :param int output_type: Output type (1 = Thai, 2 = English, 3 = Number). + + :return: The Zodiac name or number corresponding to the input year. + :rtype: Union[str, int] + """ + # Calculate zodiac index + result = year % 12 + if result - 3 < 1: + result = result - 3 + 12 + else: + result = result - 3 + + # Return the zodiac based on the output type + return _ZODIAC[output_type][result - 1]
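A worked editorial example of the arithmetic above, assuming th_zodiac is re-exported from pythainlp.util like the other utilities: 2024 % 12 = 8, and 8 - 3 = 5, so the fifth animal is returned.

    from pythainlp.util import th_zodiac

    print(th_zodiac(2024))     # มะโรง (Dragon)
    print(th_zodiac(2024, 2))  # DRAGON
    print(th_zodiac(2024, 3))  # 5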
+ + + +
+[docs] +def to_lunar_date(input_date: date) -> str: + """ + Convert the solar date to Thai Lunar Date + + :param date input_date: date of the day. + :return: Thai text lunar date + :rtype: str + """ + # Check if date is within supported range + if input_date.year < 1903 or input_date.year > 2460: + raise NotImplementedError("Unsupported date") # Unsupported date + + # Choose the nearest begin date + c_year = input_date.year - 1 + begin_date = _BEGIN_DATES[0] + for _date in reversed(_BEGIN_DATES): + if c_year > _date.year: + begin_date = _date + break + + current_date = begin_date + for year in range(begin_date.year + 1, input_date.year): + day_in_year = last_day_in_year(year) + current_date += timedelta(days=day_in_year) + + r_day_prev = (date(current_date.year, 12, 31) - current_date).days + day_of_year = (input_date - date(input_date.year, 1, 1)).days + day_from_one = r_day_prev + day_of_year + 1 + last_day = last_day_in_year(input_date.year) + + if last_day == 354: + days_in_month = _DAYS_354 + elif last_day == 355: + days_in_month = _DAYS_355 + elif last_day == 384: + days_in_month = _DAYS_384 + + days_of_year = day_from_one + for j, days in enumerate(days_in_month, start=1): + th_m = j + if 0 < days_of_year <= days: + break + else: + days_of_year -= days + + if last_day <= 355: # 354 or 355 + if th_m > 12: + th_m = th_m - 12 + elif last_day == 384: + if th_m > 13: + th_m = th_m - 13 + if th_m >= 9 and th_m <= 13: + th_m = th_m - 1 + + if days_of_year > 15: + th_s = "แรม" + days_of_year = days_of_year - 15 + else: + th_s = "ขึ้น" + + thai_lunar_date = f"{th_s} {days_of_year} ค่ำ เดือน {th_m}" + + return thai_lunar_date
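A hedged editorial usage sketch (the import path is assumed); the exact day and month numbers depend on the deviation tables above, so the comment only illustrates the shape of the returned string:

    from datetime import date
    from pythainlp.util import to_lunar_date

    print(to_lunar_date(date(2025, 1, 1)))
    # something like "ขึ้น ... ค่ำ เดือน ..." or "แรม ... ค่ำ เดือน ..."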
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/time.html b/5.1/_modules/pythainlp/util/time.html new file mode 100644 index 0000000..8cb02ca --- /dev/null +++ b/5.1/_modules/pythainlp/util/time.html @@ -0,0 +1,472 @@ + + + + + + + + pythainlp.util.time — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.time

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Spell out time as Thai words.
+
+Convert time string or time object to Thai words.
+"""
+from datetime import datetime, time
+from typing import Union
+
+from pythainlp.tokenize import Tokenizer
+from pythainlp.util.numtoword import num_to_thaiword
+from pythainlp.util.wordtonum import thaiword_to_num
+
+_TIME_FORMAT_WITH_SEC = "%H:%M:%S"
+_TIME_FORMAT_WITHOUT_SEC = "%H:%M"
+_DICT_THAI_TIME = {
+    "ศูนย์": 0,
+    "หนึ่ง": 1,
+    "สอง": 2,
+    "ยี่": 2,
+    "สาม": 3,
+    "สี่": 4,
+    "ห้า": 5,
+    "หก": 6,
+    "เจ็ด": 7,
+    "แปด": 8,
+    "เก้า": 9,
+    "สิบ": 10,
+    "เอ็ด": 1,
+    # set the value of the time unit
+    "โมงเช้า": 6,  # start counting at 7:00 a.m.
+    "โมงเย็น": 13,
+    "บ่าย": 13,
+    "บ่ายโมง": 13,
+    "ตี": 0,
+    "เที่ยงวัน": 12,
+    "เที่ยงคืน": 0,
+    "เที่ยง": 12,
+    "ทุ่ม": 18,
+    "นาฬิกา": 0,
+    "ครึ่ง": 30,
+}
+_THAI_TIME_CUT = Tokenizer(
+    custom_dict=list(_DICT_THAI_TIME.keys()), engine="newmm"
+)
+_THAI_TIME_AFFIX = [
+    "โมงเช้า",
+    "บ่ายโมง",
+    "โมงเย็น",
+    "โมง",
+    "นาฬิกา",
+    "ทุ่ม",
+    "ตี",
+    "เที่ยงคืน",
+    "เที่ยงวัน",
+    "เที่ยง",
+]
+
+
+def _format_6h(h: int) -> str:
+    """Thai time (6-hour clock)."""
+    text = ""
+
+    if h == 0:
+        text += "เที่ยงคืน"
+    elif h < 7:
+        text += "ตี" + num_to_thaiword(h)
+    elif h < 12:
+        text += num_to_thaiword(h - 6) + "โมงเช้า"
+    elif h == 12:
+        text += "เที่ยง"
+    elif h < 18:
+        if h == 13:
+            text += "บ่ายโมง"
+        else:
+            text += "บ่าย" + num_to_thaiword(h - 12) + "โมง"
+    elif h == 18:
+        text += "หกโมงเย็น"
+    else:
+        text += num_to_thaiword(h - 18) + "ทุ่ม"
+
+    return text
+
+
+def _format_m6h(h: int) -> str:
+    """Thai time (modified 6-hour clock)."""
+    text = ""
+
+    if h == 0:
+        text += "เที่ยงคืน"
+    elif h < 6:
+        text += "ตี" + num_to_thaiword(h)
+    elif h < 12:
+        text += num_to_thaiword(h) + "โมง"
+    elif h == 12:
+        text += "เที่ยง"
+    elif h < 19:
+        text += num_to_thaiword(h - 12) + "โมง"
+    else:
+        text += num_to_thaiword(h - 18) + "ทุ่ม"
+
+    return text
+
+
+def _format_24h(h: int) -> str:
+    """Thai time (24-hour clock)."""
+    text = num_to_thaiword(h) + "นาฬิกา"
+    return text
+
+
+def _format(
+    h: int,
+    m: int,
+    s: int,
+    fmt: str = "24h",
+    precision: Union[str, None] = None,
+) -> str:
+    text = ""
+    if fmt == "6h":
+        text = _format_6h(h)
+    elif fmt == "m6h":
+        text = _format_m6h(h)
+    elif fmt == "24h":
+        text = _format_24h(h)
+    else:
+        raise NotImplementedError(f"Time format not supported: {fmt}")
+
+    if precision in ("m", "s"):
+        if m == 30 and (s == 0 or precision == "m") and (fmt in ("6h", "m6h")):
+            text += "ครึ่ง"
+        else:
+            text += num_to_thaiword(m) + "นาที"
+            if precision == "s":
+                text += num_to_thaiword(s) + "วินาที"
+    else:
+        if m:
+            if m == 30 and s == 0 and (fmt in ("6h", "m6h")):
+                text += "ครึ่ง"
+            else:
+                text += num_to_thaiword(m) + "นาที"
+        if s:
+            text += num_to_thaiword(s) + "วินาที"
+
+    return text
+
+
+
+[docs] +def time_to_thaiword( + time_data: Union[time, datetime, str], + fmt: str = "24h", + precision: Union[str, None] = None, +) -> str: + """ + Spell out time as Thai words. + + :param str time_data: time input, can be a datetime.time object \ + or a datetime.datetime object \ + or a string (in H:M or H:M:S format, using 24-hour clock) + :param str fmt: time output format + * *24h* - 24-hour clock (default) + * *6h* - 6-hour clock + * *m6h* - Modified 6-hour clock + :param str precision: precision of the spell out time + * *m* - always spell out at minute level + * *s* - always spell out at second level + * None - spell out only non-zero parts + :return: Time spelled out as Thai words + :rtype: str + + :Example: + :: + + time_to_thaiword("8:17") + # output: + # แปดนาฬิกาสิบเจ็ดนาที + + time_to_thaiword("8:17", "6h") + # output: + # สองโมงเช้าสิบเจ็ดนาที + + time_to_thaiword("8:17", "m6h") + # output: + # แปดโมงสิบเจ็ดนาที + + time_to_thaiword("18:30", fmt="m6h") + # output: + # หกโมงครึ่ง + + time_to_thaiword(datetime.time(12, 3, 0)) + # output: + # สิบสองนาฬิกาสามนาที + + time_to_thaiword(datetime.time(12, 3, 0), precision="s") + # output: + # สิบสองนาฬิกาสามนาทีศูนย์วินาที + """ + _time = None + + if isinstance(time_data, (time, datetime)): + _time = time_data + else: + if not isinstance(time_data, str): + raise TypeError( + "Time input must be a datetime.time object, " + "a datetime.datetime object, or a string." + ) + + if not time_data: + raise ValueError("Time string cannot be empty.") + + try: + _time = datetime.strptime(time_data, _TIME_FORMAT_WITH_SEC) + except ValueError: + try: + _time = datetime.strptime(time_data, _TIME_FORMAT_WITHOUT_SEC) + except ValueError: + pass + + if not _time: + raise ValueError( + f"Time string '{time_data}' does not match H:M or H:M:S format." + ) + + text = _format(_time.hour, _time.minute, _time.second, fmt, precision) + + return text
+ + + +
+[docs]
+def thaiword_to_time(text: str, padding: bool = True) -> str:
+    """
+    Convert Thai time in words into time (H:M).
+
+    :param str text: Thai time in words
+    :param bool padding: zero-pad the hour if True
+
+    :return: time string
+    :rtype: str
+
+    :Example:
+    ::
+
+        thaiword_to_time("บ่ายโมงครึ่ง")
+        # output:
+        # 13:30
+    """
+    keys_dict = list(_DICT_THAI_TIME.keys())
+    text = text.replace("กว่า", "").replace("ๆ", "").replace(" ", "")
+    _i = ["ตีหนึ่ง", "ตีสอง", "ตีสาม", "ตีสี่", "ตีห้า"]
+    _time = ""
+    for affix in _THAI_TIME_AFFIX:
+        if affix in text and affix != "ตี":
+            _time = text.replace(affix, affix + "|")
+            break
+        elif affix in text and affix == "ตี":
+            for j in _i:
+                if j in text:
+                    _time = text.replace(j, j + "|")
+                    break
+        else:
+            pass
+    if "|" not in _time:
+        raise ValueError("Cannot find any Thai word for time affix.")
+
+    _LIST_THAI_TIME = _time.split("|")
+    del _time
+
+    hour = _THAI_TIME_CUT.word_tokenize(_LIST_THAI_TIME[0])
+    minute = _LIST_THAI_TIME[1]
+    if len(minute) > 1:
+        minute = _THAI_TIME_CUT.word_tokenize(minute)
+    else:
+        minute = 0
+    text = ""
+
+    # determine hour
+    if hour[-1] == "นาฬิกา" and hour[0] in keys_dict and hour[:-1]:
+        text += str(thaiword_to_num("".join(hour[:-1])))
+    elif hour[0] == "ตี" and hour[1] in keys_dict:
+        text += str(_DICT_THAI_TIME[hour[1]])
+    elif hour[-1] == "โมงเช้า" and hour[0] in keys_dict:
+        if _DICT_THAI_TIME[hour[0]] < 6:
+            text += str(_DICT_THAI_TIME[hour[0]] + 6)
+        else:
+            text += str(_DICT_THAI_TIME[hour[0]])
+    elif (hour[-1] == "โมงเย็น" or hour[-1] == "โมง") and hour[0] == "บ่าย":
+        text += str(_DICT_THAI_TIME[hour[1]] + 12)
+    elif (hour[-1] == "โมงเย็น" or hour[-1] == "โมง") and hour[0] in keys_dict:
+        text += str(_DICT_THAI_TIME[hour[0]] + 12)
+    elif hour[-1] == "เที่ยงคืน":
+        text += "0"
+    elif hour[-1] == "เที่ยงวัน" or hour[-1] == "เที่ยง":
+        text += "12"
+    elif hour[0] == "บ่ายโมง":
+        text += "13"
+    elif hour[-1] == "ทุ่ม":
+        if len(hour) == 1:
+            text += "19"
+        else:
+            text += str(_DICT_THAI_TIME[hour[0]] + 18)
+
+    if not text:
+        raise ValueError("Cannot find any Thai word for hour.")
+
+    if padding and len(text) == 1:
+        text = "0" + text
+    text += ":"
+
+    # determine minute
+    if minute:
+        n = 0
+        for affix in minute:
+            if affix in keys_dict:
+                if affix != "สิบ":
+                    n += _DICT_THAI_TIME[affix]
+                elif affix == "สิบ" and n != 0:
+                    n *= 10
+                elif affix == "สิบ" and n == 0:
+                    n += 10
+        if n != 0 and n > 9:
+            text += str(n)
+        else:
+            text += "0" + str(n)
+    else:
+        text += "00"
+
+    return text
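A short round-trip sketch combining the two public functions above; the import line assumes both are exposed from pythainlp.util, as the other utilities on these pages are:

    from pythainlp.util import time_to_thaiword, thaiword_to_time

    words = time_to_thaiword("13:30", fmt="6h")  # "บ่ายโมงครึ่ง"
    print(thaiword_to_time(words))               # "13:30"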
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/trie.html b/5.1/_modules/pythainlp/util/trie.html new file mode 100644 index 0000000..e991816 --- /dev/null +++ b/5.1/_modules/pythainlp/util/trie.html @@ -0,0 +1,292 @@ + + + + + + + + pythainlp.util.trie — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.trie

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Trie data structure.
+
+Designed to be used for a tokenizer's dictionary, but it can also be used for other purposes.
+"""
+from typing import Iterable, Iterator, List, Union
+
+
+
+[docs] +class Trie(Iterable[str]): +
+[docs] + class Node: + __slots__ = "end", "children" + +
+[docs] + def __init__(self): + self.end = False + self.children = {}
+
+ + +
+[docs] + def __init__(self, words: Iterable[str]): + self.words = set(words) + self.root = Trie.Node() + + for word in words: + self.add(word)
+ + +
+[docs]
+    def add(self, word: str) -> None:
+        """
+        Add a word to the trie.
+        Leading and trailing spaces of the word are removed before it is added.
+
+        :param str word: a word to add
+        """
+        word = word.strip()
+        self.words.add(word)
+        cur = self.root
+        for ch in word:
+            child = cur.children.get(ch)
+            if not child:
+                child = Trie.Node()
+                cur.children[ch] = child
+            cur = child
+        cur.end = True
+ + +
+[docs]
+    def remove(self, word: str) -> None:
+        """
+        Remove a word from the trie.
+        If the word is not found, do nothing.
+
+        :param str word: a word to remove
+        """
+        # remove from the word set first
+        if word not in self.words:
+            return
+        self.words.remove(word)
+        # then remove from the trie nodes
+        parent = self.root
+        data = []  # track path to leaf
+        for ch in word:
+            child = parent.children[ch]
+            data.append((parent, child, ch))
+            parent = child
+        # unmark the end of the word
+        child.end = False
+        # prune the tree upward, removing nodes that are no longer used
+        for parent, child, ch in reversed(data):
+            if child.end or child.children:
+                break
+            del parent.children[ch]  # remove from parent dict
+ + +
+[docs]
+    def prefixes(self, text: str) -> List[str]:
+        """
+        List all words in the trie that are prefixes of the given text.
+
+        :param str text: input text
+        :return: a list of words in the trie that are prefixes of the text
+        :rtype: List[str]
+        """
+        res = []
+        cur = self.root
+        for i, ch in enumerate(text):
+            node = cur.children.get(ch)
+            if not node:
+                break
+            if node.end:
+                res.append(text[: i + 1])
+            cur = node
+        return res
+ + + def __contains__(self, key: str) -> bool: + return key in self.words + + def __iter__(self) -> Iterator[str]: + yield from self.words + + def __len__(self) -> int: + return len(self.words)
+ + + +
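A brief usage sketch of the Trie class above, using the same import path that other modules in this changeset use (pythainlp.util.trie); the word list is illustrative:

    from pythainlp.util.trie import Trie

    trie = Trie(["ทด", "ทดสอบ", "ภาษา"])

    print("ทดสอบ" in trie)            # True
    print(trie.prefixes("ทดสอบไทย"))  # ['ทด', 'ทดสอบ'] -- trie words that prefix the text
    trie.add("ไทย")
    trie.remove("ทด")
    print(len(trie))                  # 3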
+[docs] +def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie: + """ + Create a dictionary trie from a file or an iterable. + + :param str|Iterable[str]|pythainlp.util.Trie dict_source: a path to + dictionary file or a list of words or a pythainlp.util.Trie object + :return: a trie object + :rtype: pythainlp.util.Trie + """ + trie = Trie([]) + + if isinstance(dict_source, str) and len(dict_source) > 0: + # dict_source is a path to dictionary text file + with open(dict_source, "r", encoding="utf8") as f: + _vocabs = f.read().splitlines() + trie = Trie(_vocabs) + elif isinstance(dict_source, Iterable) and not isinstance( + dict_source, str + ): + # Note: Since Trie and str are both Iterable, + # so the Iterable check should be here, at the very end, + # because it has less specificality + trie = Trie(dict_source) + else: + raise TypeError( + "Type of dict_source must be pythainlp.util.Trie, " + "or Iterable[str], or non-empty str (path to source file)" + ) + + return trie
+ +
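A small sketch of dict_trie with two of its accepted source types; passing the resulting trie to word_tokenize as custom_dict reflects the tokenizer-dictionary use case named in the module docstring, and the file name in the comment is purely hypothetical:

    from pythainlp.tokenize import word_tokenize
    from pythainlp.util.trie import dict_trie

    # from an iterable of words
    custom_dict = dict_trie(["โควิด", "วัคซีน"])
    print(word_tokenize("ฉีดวัคซีนโควิด", custom_dict=custom_dict, engine="newmm"))

    # from a word-list file, one word per line ("my_dict.txt" is a hypothetical path)
    # custom_dict = dict_trie("my_dict.txt")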
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/util/wordtonum.html b/5.1/_modules/pythainlp/util/wordtonum.html new file mode 100644 index 0000000..7019c21 --- /dev/null +++ b/5.1/_modules/pythainlp/util/wordtonum.html @@ -0,0 +1,364 @@ + + + + + + + + pythainlp.util.wordtonum — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.util.wordtonum

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Convert numbers written out as Thai words into computable number values
+
+First version of the code adapted from Korakot Chaovavanich's notebook
+https://colab.research.google.com/drive/148WNIeclf0kOU6QxKd6pcfwpSs8l-VKD#scrollTo=EuVDd0nNuI8Q
+"""
+import re
+from typing import List
+
+from pythainlp.corpus import thai_words
+from pythainlp.tokenize import Tokenizer
+
+_ptn_digits = r"(|หนึ่ง|เอ็ด|สอง|ยี่|สาม|สี่|ห้า|หก|เจ็ด|แปด|เก้า)"
+_ptn_six_figures = (
+    rf"({_ptn_digits}แสน)?({_ptn_digits}หมื่น)?({_ptn_digits}พัน)?"
+    rf"({_ptn_digits}ร้อย)?({_ptn_digits}สิบ)?{_ptn_digits}?"
+)
+_ptn_thai_numerals = rf"(ลบ)?({_ptn_six_figures}ล้าน)*{_ptn_six_figures}"
+_re_thai_numerals = re.compile(_ptn_thai_numerals)
+
+_digits = {
+    # "ศูนย์" was excluded as a special case
+    "หนึ่ง": 1,
+    "เอ็ด": 1,
+    "สอง": 2,
+    "ยี่": 2,
+    "สาม": 3,
+    "สี่": 4,
+    "ห้า": 5,
+    "หก": 6,
+    "เจ็ด": 7,
+    "แปด": 8,
+    "เก้า": 9,
+}
+_powers_of_10 = {
+    "สิบ": 10,
+    "ร้อย": 100,
+    "พัน": 1000,
+    "หมื่น": 10000,
+    "แสน": 100000,
+    # "ล้าน" was excluded as a special case
+}
+_valid_tokens = (
+    set(_digits.keys()) | set(_powers_of_10.keys()) | {"ล้าน", "ลบ"}
+)
+_tokenizer = Tokenizer(custom_dict=_valid_tokens)
+
+
+def _check_is_thainum(word: str):
+    for j in list(_digits.keys()):
+        if j in word:
+            return (True, "num")
+    for j in ["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด", "ลบ"]:
+        if j in word:
+            return (True, "unit")
+    return (False, None)
+
+
+_dict_words = [i for i in list(thai_words()) if not _check_is_thainum(i)[0]]
+_dict_words += list(_digits.keys())
+_dict_words += ["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด"]
+
+_tokenizer_thaiwords = Tokenizer(_dict_words)
+
+
+
+[docs] +def thaiword_to_num(word: str) -> int: + """ + Converts the spelled-out numerals in Thai scripts into an actual integer. + + :param str word: Spelled-out numerals in Thai scripts + :return: Corresponding integer value of the input + :rtype: int + + :Example: + :: + + from pythainlp.util import thaiword_to_num + + thaiword_to_num("ศูนย์") + # output: 0 + + thaiword_to_num("สองล้านสามแสนหกร้อยสิบสอง") + # output: 2300612 + + """ + if not isinstance(word, str): + raise TypeError(f"The input must be a string; given {word!r}") + if not word: + raise ValueError("The input string cannot be empty") + if word == "ศูนย์": + return 0 + if not _re_thai_numerals.fullmatch(word): + raise ValueError("The input string is not a valid Thai numeral") + + tokens = _tokenizer.word_tokenize(word) + accumulated = 0 + next_digit = 1 + + is_minus = False + if tokens[0] == "ลบ": + is_minus = True + tokens.pop(0) + + for token in tokens: + if token in _digits: + next_digit = _digits[token] + elif token in _powers_of_10: + # Absent digit assumed 1 before all powers of 10 (except million) + accumulated += max(next_digit, 1) * _powers_of_10[token] + next_digit = 0 + else: # token == "ล้าน" + # Absent digit assumed 0 before word million + accumulated = (accumulated + next_digit) * 1000000 + next_digit = 0 + + # Cleaning up trailing digit + accumulated += next_digit + + if is_minus: + accumulated = -accumulated + + return accumulated
+ + + +def _decimal_unit(words: list) -> float: + _num = 0.0 + for i, v in enumerate(words): + _num += int(thaiword_to_num(v)) / (10 ** (i + 1)) + return _num + + +
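The pattern _ptn_thai_numerals and the token loop in thaiword_to_num above also handle a leading "ลบ" (minus). A short sketch, using the import path given in the docstring:

    from pythainlp.util import thaiword_to_num

    print(thaiword_to_num("ลบห้าสิบ"))       # -50, via the is_minus branch above
    print(thaiword_to_num("หนึ่งร้อยเอ็ด"))  # 101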
+[docs]
+def words_to_num(words: list) -> float:
+    """
+    Convert a list of Thai number words to a float.
+
+    :param list words: list of Thai number words
+    :return: numeric value of the words
+    :rtype: float
+
+    :Example:
+    ::
+
+        from pythainlp.util import words_to_num
+
+        words_to_num(["ห้า", "สิบ", "จุด", "เก้า", "ห้า"])
+        # output: 50.95
+
+    """
+    num = 0
+    if "จุด" not in words:
+        num = thaiword_to_num("".join(words))
+    else:
+        words_int = "".join(words[: words.index("จุด")])
+        words_float = words[words.index("จุด") + 1 :]
+        num = thaiword_to_num(words_int)
+        if num <= -1:
+            num -= _decimal_unit(words_float)
+        else:
+            num += _decimal_unit(words_float)
+
+    return num
+ + + +
+[docs] +def text_to_num(text: str) -> List[str]: + """ + Thai text to list of Thai words with floating point numbers + + :param str text: Thai text with the spelled-out numerals + :return: list of Thai words with float values of the input + :rtype: List[str] + + :Example: + :: + + from pythainlp.util import text_to_num + + text_to_num("เก้าร้อยแปดสิบจุดเก้าห้าบาทนี่คือจำนวนทั้งหมด") + # output: ['980.95', 'บาท', 'นี่', 'คือ', 'จำนวน', 'ทั้งหมด'] + + text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้าบาท") + # output: ['10021889', 'บาท'] + + """ + _temp = _tokenizer_thaiwords.word_tokenize(text) + thainum = [] + last_index = -1 + list_word_new = [] + for i, word in enumerate(_temp): + if ( + _check_is_thainum(word)[0] + and last_index + 1 == i + and i + 1 == len(_temp) + ): + thainum.append(word) + list_word_new.append(str(words_to_num(thainum))) + elif _check_is_thainum(word)[0] and last_index + 1 == i: + thainum.append(word) + last_index = i + elif _check_is_thainum(word)[0]: + thainum.append(word) + last_index = i + elif ( + not _check_is_thainum(word)[0] + and last_index + 1 == i + and last_index != -1 + ): + list_word_new.append(str(words_to_num(thainum))) + thainum = [] + list_word_new.append(word) + else: + list_word_new.append(word) + last_index = -1 + return list_word_new
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/wangchanberta/core.html b/5.1/_modules/pythainlp/wangchanberta/core.html new file mode 100644 index 0000000..295007b --- /dev/null +++ b/5.1/_modules/pythainlp/wangchanberta/core.html @@ -0,0 +1,398 @@ + + + + + + + + pythainlp.wangchanberta.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.wangchanberta.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+import re
+import warnings
+from typing import List, Tuple, Union
+
+from transformers import (
+    CamembertTokenizer,
+    pipeline,
+)
+
+from pythainlp.tokenize import word_tokenize
+
+_model_name = "wangchanberta-base-att-spm-uncased"
+_tokenizer = CamembertTokenizer.from_pretrained(
+    f"airesearch/{_model_name}", revision="main"
+)
+if _model_name == "wangchanberta-base-att-spm-uncased":
+    _tokenizer.additional_special_tokens = ["<s>NOTUSED", "</s>NOTUSED", "<_>"]
+
+
+
+[docs] +class ThaiNameTagger: +
+[docs] + def __init__( + self, dataset_name: str = "thainer", grouped_entities: bool = True + ): + """ + This function tags named entities in text in IOB format. + + Powered by wangchanberta from VISTEC-depa\ + AI Research Institute of Thailand + + :param str dataset_name: + * *thainer* - ThaiNER dataset + :param bool grouped_entities: grouped entities + """ + self.dataset_name = dataset_name + self.grouped_entities = grouped_entities + self.classify_tokens = pipeline( + task="ner", + tokenizer=_tokenizer, + model=f"airesearch/{_model_name}", + revision=f"finetuned@{self.dataset_name}-ner", + ignore_labels=[], + grouped_entities=self.grouped_entities, + )
+ + + def _IOB(self, tag): + if tag != "O": + return "B-" + tag + return "O" + + def _clear_tag(self, tag): + return tag.replace("B-", "").replace("I-", "") + +
+[docs] + def get_ner( + self, text: str, pos: bool = False, tag: bool = False + ) -> Union[List[Tuple[str, str]], str]: + """ + This function tags named entities in text in IOB format. + Powered by wangchanberta from VISTEC-depa\ + AI Research Institute of Thailand + + :param str text: text in Thai to be tagged + :param bool tag: output HTML-like tags. + :return: a list of tuples associated with tokenized word groups,\ + NER tags, and output HTML-like tags (if the parameter `tag` is \ + specified as `True`). \ + Otherwise, return a list of tuples associated with tokenized \ + words and NER tags + :rtype: Union[list[tuple[str, str]]], str + """ + if pos: + warnings.warn( + "This model doesn't support output of POS tags and it doesn't output the POS tags." + ) + text = re.sub(" ", "<_>", text) + self.json_ner = self.classify_tokens(text) + self.output = "" + if self.grouped_entities and self.dataset_name == "thainer": + self.sent_ner = [ + ( + i["word"].replace("<_>", " ").replace("▁", ""), + self._IOB(i["entity_group"]), + ) + for i in self.json_ner + ] + elif self.dataset_name == "thainer": + self.sent_ner = [ + (i["word"].replace("<_>", " ").replace("▁", ""), i["entity"]) + for i in self.json_ner + if i["word"] != "▁" + ] + else: + self.sent_ner = [ + ( + i["word"].replace("<_>", " ").replace("▁", ""), + i["entity"].replace("_", "-").replace("E-", "I-"), + ) + for i in self.json_ner + ] + if self.sent_ner[0][0] == "" and len(self.sent_ner) > 1: + self.sent_ner = self.sent_ner[1:] + for idx, (word, ner) in enumerate(self.sent_ner): + if idx > 0 and ner.startswith("B-"): + if self._clear_tag(ner) == self._clear_tag( + self.sent_ner[idx - 1][1] + ): + self.sent_ner[idx] = (word, ner.replace("B-", "I-")) + if tag: + temp = "" + sent = "" + for idx, (word, ner) in enumerate(self.sent_ner): + if ner.startswith("B-") and temp != "": + sent += "</" + temp + ">" + temp = ner[2:] + sent += "<" + temp + ">" + elif ner.startswith("B-"): + temp = ner[2:] + sent += "<" + temp + ">" + elif ner == "O" and temp != "": + sent += "</" + temp + ">" + temp = "" + sent += word + + if idx == len(self.sent_ner) - 1 and temp != "": + sent += "</" + temp + ">" + + return sent + else: + return self.sent_ner
+
+ + + +
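A minimal usage sketch for the ThaiNameTagger class above. The import assumes the class is re-exported from pythainlp.wangchanberta, the sentence is illustrative, and the first call downloads the fine-tuned wangchanberta model via transformers:

    # assumption: ThaiNameTagger is re-exported from pythainlp.wangchanberta
    from pythainlp.wangchanberta import ThaiNameTagger

    ner = ThaiNameTagger(dataset_name="thainer", grouped_entities=True)

    # a list of (word, IOB tag) tuples
    print(ner.get_ner("นายกรัฐมนตรีเดินทางไปเชียงใหม่"))

    # HTML-like tags instead of tuples
    print(ner.get_ner("นายกรัฐมนตรีเดินทางไปเชียงใหม่", tag=True))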
+[docs] +class NamedEntityRecognition: +
+[docs] + def __init__( + self, model: str = "pythainlp/thainer-corpus-v2-base-model" + ) -> None: + """ + This function tags named entities in text in IOB format. + + Powered by wangchanberta from VISTEC-depa\ + AI Research Institute of Thailand + :param str model: The model that use wangchanberta pretrained. + """ + from transformers import AutoModelForTokenClassification, AutoTokenizer + + self.tokenizer = AutoTokenizer.from_pretrained(model) + self.model = AutoModelForTokenClassification.from_pretrained(model)
+ + + def _fix_span_error(self, words, ner): + _ner = [] + _ner = ner + _new_tag = [] + for i, j in zip(words, _ner): + i = self.tokenizer.decode(i) + if i.isspace() and j.startswith("B-"): + j = "O" + if i in ("", "<s>", "</s>"): + continue + if i == "<_>": + i = " " + _new_tag.append((i, j)) + return _new_tag + +
+[docs] + def get_ner( + self, text: str, pos: bool = False, tag: bool = False + ) -> Union[List[Tuple[str, str]], str]: + """ + This function tags named entities in text in IOB format. + Powered by wangchanberta from VISTEC-depa\ + AI Research Institute of Thailand + + :param str text: text in Thai to be tagged + :param bool tag: output HTML-like tags. + :return: a list of tuples associated with tokenized word groups, NER tags, \ + and output HTML-like tags (if the parameter `tag` is \ + specified as `True`). \ + Otherwise, return a list of tuples associated with tokenized \ + words and NER tags + :rtype: Union[list[tuple[str, str]]], str + """ + import torch + + if pos: + warnings.warn( + "This model doesn't support output postag and It doesn't output the postag." + ) + words_token = word_tokenize(text.replace(" ", "<_>")) + inputs = self.tokenizer( + words_token, is_split_into_words=True, return_tensors="pt" + ) + ids = inputs["input_ids"] + mask = inputs["attention_mask"] + # forward pass + outputs = self.model(ids, attention_mask=mask) + logits = outputs[0] + predictions = torch.argmax(logits, dim=2) + predicted_token_class = [ + self.model.config.id2label[t.item()] for t in predictions[0] + ] + ner_tag = self._fix_span_error( + inputs["input_ids"][0], predicted_token_class + ) + if tag: + temp = "" + sent = "" + for idx, (word, ner) in enumerate(ner_tag): + if ner.startswith("B-") and temp != "": + sent += "</" + temp + ">" + temp = ner[2:] + sent += "<" + temp + ">" + elif ner.startswith("B-"): + temp = ner[2:] + sent += "<" + temp + ">" + elif ner == "O" and temp != "": + sent += "</" + temp + ">" + temp = "" + sent += word + + if idx == len(ner_tag) - 1 and temp != "": + sent += "</" + temp + ">" + + return sent + return ner_tag
+
+ + + +
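A similar sketch for the NamedEntityRecognition class above; the model shown is its documented default, the import path is assumed to mirror ThaiNameTagger, and running it requires transformers and torch:

    from pythainlp.wangchanberta import NamedEntityRecognition

    ner = NamedEntityRecognition()  # defaults to "pythainlp/thainer-corpus-v2-base-model"
    for word, tag in ner.get_ner("คุณสมชายทำงานที่กรุงเทพ"):
        print(word, tag)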
+[docs] +def segment(text: str) -> List[str]: + """ + Subword tokenize. SentencePiece from wangchanberta model. + + :param str text: text to be tokenized + :return: list of subwords + :rtype: list[str] + """ + if not text or not isinstance(text, str): + return [] + + return _tokenizer.tokenize(text)
+ +
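And a one-line sketch of the segment function above, which applies the model's SentencePiece tokenizer (import path assumed as for the classes above):

    from pythainlp.wangchanberta import segment

    print(segment("ฉันรักภาษาไทย"))  # a list of subword strings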
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/word_vector/core.html b/5.1/_modules/pythainlp/word_vector/core.html new file mode 100644 index 0000000..85c1f5d --- /dev/null +++ b/5.1/_modules/pythainlp/word_vector/core.html @@ -0,0 +1,484 @@ + + + + + + + + pythainlp.word_vector.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.word_vector.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from typing import List, Tuple
+
+from gensim.models import KeyedVectors
+from gensim.models.keyedvectors import Word2VecKeyedVectors
+from numpy import ndarray, zeros
+
+from pythainlp.corpus import get_corpus_path
+from pythainlp.tokenize import THAI2FIT_TOKENIZER, word_tokenize
+
+WV_DIM = 300  # word vector dimension
+
+_MODEL_NAME = "thai2fit_wv"
+
+_TK_SP = "xxspace"
+_TK_EOL = "xxeol"
+
+
+
+[docs] +class WordVector: + """ + Word Vector class + + :param str model_name: model name + + **Options for model_name** + * *thai2fit_wv* (default) - word vector from thai2fit + * *ltw2v* - word vector from LTW2V: The Large Thai Word2Vec v0.1 + * *ltw2v_v1.0_15_window* - word vector from LTW2V v1.0 and 15 window + * *ltw2v_v1.0_5_window* - word vector from LTW2V v1.0 and 5 window + """ + +
+[docs] + def __init__(self, model_name: str = "thai2fit_wv") -> None: + """ + Word Vector class + + :param str model_name: model name + + **Options for model_name** + * *thai2fit_wv* (default) - word vector from thai2fit + * *ltw2v* - word vector from LTW2V: The Large Thai Word2Vec + * *ltw2v_v1.0_15_window* - word2vec from LTW2V 1.0 and 15 window + * *ltw2v_v1.0_5_window* - word2vec from LTW2V v1.0 and 5 window + """ + self.load_wordvector(model_name)
+ + +
+[docs] + def load_wordvector(self, model_name: str): + """ + Load word vector model. + + :param str model_name: model name + """ + self.model_name = model_name + self.model = KeyedVectors.load_word2vec_format( + get_corpus_path(self.model_name), + binary=True, + unicode_errors="ignore", + ) + self.WV_DIM = self.model.vector_size + + if self.model_name == "thai2fit_wv": + self.tokenize = THAI2FIT_TOKENIZER.word_tokenize + else: + self.tokenize = word_tokenize
+ + +
+[docs] + def get_model(self) -> Word2VecKeyedVectors: + """ + Get word vector model. + + :return: `gensim` word2vec model + :rtype: gensim.models.keyedvectors.Word2VecKeyedVectors + """ + return self.model
+ + +
+[docs] + def doesnt_match(self, words: List[str]) -> str: + """ + This function returns one word that is mostly unrelated to other words + in the list. We use the function :func:`doesnt_match` + from :mod:`gensim`. + + :param list words: a list of words + :raises KeyError: if there is any word in `positive` or `negative` that is + not in the vocabulary of the model. + :return: the word is that mostly unrelated + :rtype: str + + :Note: + * If a word in `words` is not in the vocabulary, :class:`KeyError` + will be raised. + + :Example: + Pick the word "พริกไทย" (name of food) out of the list of meals + ("อาหารเช้า", "อาหารเที่ยง", "อาหารเย็น"). + >>> from pythainlp.word_vector import WordVector + >>> + >>> wv = WordVector() + >>> words = ['อาหารเช้า', 'อาหารเที่ยง', 'อาหารเย็น', 'พริกไทย'] + >>> wv.doesnt_match(words) + พริกไทย + + Pick the word "เรือ" (name of vehicle) out of the list of words + related to occupation ("ดีไซน์เนอร์", "พนักงานเงินเดือน", "หมอ"). + + >>> from pythainlp.word_vector import WordVector + >>> + >>> wv = WordVector() + >>> words = ['ดีไซน์เนอร์', 'พนักงานเงินเดือน', 'หมอ', 'เรือ'] + >>> wv.doesnt_match(words) + เรือ + """ + return self.model.doesnt_match(words)
+ + +
+[docs] + def most_similar_cosmul( + self, positive: List[str], negative: List[str] + ) -> List[Tuple[str, float]]: + """ + This function finds the top-10 words that are most similar with respect + to two lists of words labeled as positive and negative. + The top-10 most similar words are obtained using multiplication + combination objective from Omer Levy and Yoav Goldberg + [OmerLevy_YoavGoldberg_2014]_. + + We use the function :func:`gensim.most_similar_cosmul` directly from + :mod:`gensim`. + + :param list positive: a list of words to add + :param list negative: a list of words to subtract + + :raises KeyError: if there is any word in `positive` or `negative` that is + not in the vocabulary of the model. + :return: list of top-10 most similar words and its similarity score + :rtype: list[tuple[str, float]] + + :Note: + * With a single word in the positive list, it will find the + most similar words to the word given (similar + to :func:`gensim.most_similar`) + * If a word in `positive` or `negative` is not in the vocabulary, + :class:`KeyError` will be raised. + + :Example: + + Find the **top-10** most similar words to the word: "แม่น้ำ". + + >>> from pythainlp.word_vector import WordVector + >>> + >>> wv = WordVector() + >>> list_positive = ['แม่น้ำ'] + >>> list_negative = [] + >>> wv.most_similar_cosmul(list_positive, list_negative) + [('ลำน้ำ', 0.8206598162651062), ('ทะเลสาบ', 0.775945782661438), + ('ลุ่มน้ำ', 0.7490593194961548), ('คลอง', 0.7471904754638672), + ('ปากแม่น้ำ', 0.7354257106781006), ('ฝั่งแม่น้ำ', 0.7120099067687988), + ('ทะเล', 0.7030453681945801), ('ริมแม่น้ำ', 0.7015200257301331), + ('แหล่งน้ำ', 0.6997432112693787), ('ภูเขา', 0.6960948705673218)] + + Find the **top-10** most similar words to the words: "นายก", + "รัฐมนตรี", and "ประเทศ". + + >>> from pythainlp.word_vector import WordVector + >>> + >>> wv = WordVector() + >>> list_positive = ['นายก', 'รัฐมนตรี', 'ประเทศ'] + >>> list_negative = [] + >>> wv.most_similar_cosmul(list_positive, list_negative) + [('รองนายกรัฐมนตรี', 0.2730445861816406), + ('เอกอัครราชทูต', 0.26500266790390015), + ('นายกรัฐมนตรี', 0.2649088203907013), + ('ผู้ว่าราชการจังหวัด', 0.25119125843048096), + ('ผู้ว่าการ', 0.2510434687137604), ('เลขาธิการ', 0.24824175238609314), + ('ผู้ว่า', 0.2453523576259613), ('ประธานกรรมการ', 0.24147476255893707), + ('รองประธาน', 0.24123257398605347), ('สมาชิกวุฒิสภา', + 0.2405330240726471)] + + Find the **top-10** most similar words when having **only** positive + list and **both** positive and negative lists. 
+ + >>> from pythainlp.word_vector import WordVector + >>> + >>> wv = WordVector() + >>> list_positive = ['ประเทศ', 'ไทย', 'จีน', 'ญี่ปุ่น'] + >>> list_negative = [] + >>> wv.most_similar_cosmul(list_positive, list_negative) + [('ประเทศจีน', 0.22022421658039093), ('เกาหลี', 0.2196873426437378), + ('สหรัฐอเมริกา', 0.21660110354423523), + ('ประเทศญี่ปุ่น', 0.21205860376358032), + ('ประเทศไทย', 0.21159221231937408), ('เกาหลีใต้', + 0.20321202278137207), + ('อังกฤษ', 0.19610872864723206), ('ฮ่องกง', 0.1928885132074356), + ('ฝรั่งเศส', 0.18383873999118805), ('พม่า', 0.18369348347187042)] + >>> + >>> list_positive = ['ประเทศ', 'ไทย', 'จีน', 'ญี่ปุ่น'] + >>> list_negative = ['อเมริกา'] + >>> wv.most_similar_cosmul(list_positive, list_negative) + [('ประเทศไทย', 0.3278159201145172), ('เกาหลี', 0.3201899230480194), + ('ประเทศจีน', 0.31755179166793823), ('พม่า', 0.30845439434051514), + ('ประเทศญี่ปุ่น', 0.306713730096817), + ('เกาหลีใต้', 0.3003999888896942), + ('ลาว', 0.2995176911354065), ('คนไทย', 0.2885020673274994), + ('เวียดนาม', 0.2878379821777344), ('ชาวไทย', 0.28480708599090576)] + + The function returns :class:`KeyError` when the term "เมนูอาหารไทย" + is not in the vocabulary. + + >>> from pythainlp.word_vector import WordVector + >>> + >>> wv = WordVector() + >>> list_positive = ['เมนูอาหารไทย'] + >>> list_negative = [] + >>> wv.most_similar_cosmul(list_positive, list_negative) + KeyError: "word 'เมนูอาหารไทย' not in vocabulary" + """ + return self.model.most_similar_cosmul( + positive=positive, negative=negative + )
+ + +
+[docs] + def similarity(self, word1: str, word2: str) -> float: + """ + This function computes cosine similarity between two words. + + :param str word1: first word to be compared with + :param str word2: second word to be compared with + + :raises KeyError: if either `word1` or `word2` is not in the + vocabulary of the model. + :return: the cosine similarity between the two word vectors + :rtype: float + + :Note: + * If a word in `word1` or `word2` is not in the vocabulary, + :class:`KeyError` will be raised. + + :Example: + + Compute consine similarity between two words: "รถไฟ" and "รถไฟฟ้า" + (train and electric train). + + >>> from pythainlp.word_vector import WordVector + >>> wv = WordVector() + >>> wv.similarity('รถไฟ', 'รถไฟฟ้า') + 0.43387136 + + + Compute consine similarity between two words: "เสือดาว" and "รถไฟฟ้า" + (leopard and electric train). + + >>> from pythainlp.word_vector import WordVector + >>> + >>> wv = WordVector() + >>> wv.similarity('เสือดาว', 'รถไฟฟ้า') + 0.04300258 + + """ + return self.model.similarity(word1, word2)
+ + +
+[docs] + def sentence_vectorizer(self, text: str, use_mean: bool = True) -> ndarray: + """ + This function converts a Thai sentence into vector. + Specifically, it first tokenizes that text and map each tokenized word + with the word vectors from the model. + Then, word vectors are aggregated into one vector of 300 dimension + by calculating either mean or summation of all word vectors. + + :param str text: text input + :param bool use_mean: if `True` aggregate word vectors with mean of all + word vectors. Otherwise, aggregate with + summation of all word vectors + + :return: 300-dimension vector representing the given sentence + in form of :mod:`numpy` array + :rtype: :class:`numpy.ndarray((1,300))` + + + :Example: + + Vectorize the sentence, "อ้วนเสี้ยวเข้ายึดแคว้นกิจิ๋ว ในปี พ.ศ. 735", + into one sentence vector with two aggregation methods: mean + and summation. + + >>> from pythainlp.word_vector import WordVector + >>> + >>> wv = WordVector() + >>> sentence = 'อ้วนเสี้ยวเข้ายึดแคว้นกิจิ๋ว ในปี พ.ศ. 735' + >>> wv.sentence_vectorizer(sentence, use_mean=True) + array([[-0.00421414, -0.08881307, 0.05081136, -0.05632929, + -0.06607185, 0.03059357, -0.113882 , -0.00074836, 0.05035743, + 0.02914307, + ... + 0.02893357, 0.11327957, 0.04562086, -0.05015393, 0.11641257, + 0.32304936, -0.05054322, 0.03639471, -0.06531371, 0.05048079]]) + >>> + >>> wv.sentence_vectorizer(sentence, use_mean=False) + array([[-0.05899798, -1.24338295, 0.711359 , -0.78861002, + -0.92500597, 0.42831 , -1.59434797, -0.01047703, 0.705004 + , 0.40800299, + ... + 0.40506999, 1.58591403, 0.63869202, -0.702155 , 1.62977601, + 4.52269109, -0.70760502, 0.50952601, -0.914392 , 0.70673105]]) + """ + vec = zeros((1, self.WV_DIM)) + + words = self.tokenize(text) + len_words = len(words) + + if not len_words: + return vec + + for word in words: + if word == " " and self.model_name == "thai2fit_wv": + word = _TK_SP + elif word == "\n" and self.model_name == "thai2fit_wv": + word = _TK_EOL + + if word in self.model.index_to_key: + vec += self.model.get_vector(word) + + if use_mean: + vec /= len_words + + return vec
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_modules/pythainlp/wsd/core.html b/5.1/_modules/pythainlp/wsd/core.html new file mode 100644 index 0000000..c21f3e9 --- /dev/null +++ b/5.1/_modules/pythainlp/wsd/core.html @@ -0,0 +1,270 @@ + + + + + + + + pythainlp.wsd.core — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for pythainlp.wsd.core

+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+from typing import List, Tuple, Union
+
+from pythainlp.corpus import thai_wsd_dict
+from pythainlp.tokenize import Tokenizer
+from pythainlp.util.trie import Trie
+
+_wsd_dict = thai_wsd_dict()
+_mean_all = {}
+
+for i, j in zip(_wsd_dict["word"], _wsd_dict["meaning"]):
+    _mean_all[i] = j
+
+_all_word = set(list(_mean_all.keys()))
+_TRIE = Trie(list(_all_word))
+_word_cut = Tokenizer(custom_dict=_TRIE)
+
+_MODEL = None
+
+
+class _SentenceTransformersModel:
+    def __init__(
+        self,
+        model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
+        device: str = "cpu",
+    ):
+        from sentence_transformers import SentenceTransformer
+
+        self.device = device
+        self.model_name = model
+        self.model = SentenceTransformer(self.model_name, device=self.device)
+
+    def change_device(self, device: str):
+        from sentence_transformers import SentenceTransformer
+
+        self.device = device
+        self.model = SentenceTransformer(self.model_name, device=self.device)
+
+    def get_score(self, sentences1: str, sentences2: str) -> float:
+        from sentence_transformers import util
+
+        embedding_1 = self.model.encode(sentences1, convert_to_tensor=True)
+        embedding_2 = self.model.encode(sentences2, convert_to_tensor=True)
+        return 1 - util.pytorch_cos_sim(embedding_1, embedding_2)[0][0].item()
+
+
+
+[docs] +def get_sense( + sentence: str, + word: str, + device: str = "cpu", + custom_dict: dict = dict(), + custom_tokenizer: Tokenizer = _word_cut, +) -> List[Tuple[str, float]]: + """ + Get word sense from the sentence. + This function will get definition and distance from context in sentence. + + :param str sentence: Thai sentence + :param str word: Thai word + :param str device: device for running model on. + :param dict custom_dict: Thai dictionary {"word":["definition",..]} + :param Tokenizer custom_tokenizer: Tokenizer used to tokenize words in \ + sentence. + :return: a list of definitions and distances (1 - cos_sim) or \ + an empty list (if word is not in the dictionary) + :rtype: List[Tuple[str, float]] + + We get the ideas from `Context-Aware Semantic Similarity Measurement for \ + Unsupervised Word Sense Disambiguation \ + <https://arxiv.org/abs/2305.03520>`_ to build get_sense function. + + Use Thai dictionary from wiktionary. + See `thai_dict <https://pythainlp.org/pythainlp-corpus/thai_dict.html>`_. + + Use sentence transformers model from \ + `sentence-transformers/paraphrase-multilingual-mpnet-base-v2 \ + <https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2>`_ \ + for unsupervised word sense disambiguation. + + :Example: + :: + + from pythainlp.wsd import get_sense + print(get_sense("เขากำลังอบขนมคุกกี้","คุกกี้")) + # output: + # [('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', + # 0.0974416732788086), + # ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', + # 0.09319090843200684)] + + print(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คุกกี้")) + # output: + # [('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', + # 0.1005704402923584), + # ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', + # 0.12473666667938232)] + """ + global _MODEL + if not custom_dict: + custom_dict = _mean_all + + w = custom_tokenizer.word_tokenize(sentence) + if word not in set(custom_dict.keys()) or word not in sentence: + return [] + + if not _MODEL: + _MODEL = _SentenceTransformersModel(device=device) + if _MODEL.device != device: + _MODEL.change_device(device=device) + + temp_mean = custom_dict[word] + temp = [] + for i in temp_mean: + _temp_2 = [] + for j in w: + if j == word: + j = ( + word + + f" ({word} ความหมาย '" + + i.replace("(", "").replace(")", "") + + "') " + ) + _temp_2.append(j) + temp.append((i, _MODEL.get_score(sentence, "".join(_temp_2)))) + + return temp
+ +
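The docstring above demonstrates the bundled dictionary; the sketch below exercises the custom_dict parameter instead. The word, definitions, and sentence are illustrative only, and running it requires the sentence-transformers package:

    from pythainlp.wsd import get_sense

    # a custom sense inventory: {"word": ["definition 1", "definition 2", ...]}
    my_dict = {
        "เมาส์": [
            "อุปกรณ์ชี้ตำแหน่งของคอมพิวเตอร์",  # computer pointing device
            "สัตว์ฟันแทะขนาดเล็ก",              # small rodent
        ]
    }

    senses = get_sense("เขาเลื่อนเมาส์ไปคลิกที่ไอคอน", "เมาส์", custom_dict=my_dict)
    # each entry is (definition, 1 - cosine similarity); a smaller distance means closer to the context
    for definition, distance in senses:
        print(distance, definition)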
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/_sources/api/.ipynb_checkpoints/augment-checkpoint.rst.txt b/5.1/_sources/api/.ipynb_checkpoints/augment-checkpoint.rst.txt new file mode 100644 index 0000000..c41be8b --- /dev/null +++ b/5.1/_sources/api/.ipynb_checkpoints/augment-checkpoint.rst.txt @@ -0,0 +1,62 @@ +.. currentmodule:: pythainlp.augment + +pythainlp.augment +================= + +Introduction +------------ + +The `pythainlp.augment` module is a powerful toolset for text augmentation in the Thai language. Text augmentation is a process that enriches and diversifies textual data by generating alternative versions of the original text. This module is a valuable resource for improving the quality and variety of Thai language data for NLP tasks. + + +WordNetAug Class +---------------- + +The `WordNetAug` class is designed to perform text augmentation using WordNet, a lexical database for English. This class enables you to augment Thai text using English synonyms, offering a unique approach to text diversification. The following methods are available within this class: + +.. autoclass:: pythainlp.augment.WordNetAug + :members: + +Word2VecAug, Thai2fitAug, LTW2VAug Classes +------------------------------------------ + +The `pythainlp.augment.word2vec` package contains multiple classes for text augmentation using Word2Vec models. These classes include `Word2VecAug`, `Thai2fitAug`, and `LTW2VAug`. Each of these classes allows you to use Word2Vec embeddings to generate text variations. Explore the methods provided by these classes to understand their capabilities. + +.. autoclass:: pythainlp.augment.word2vec.Word2VecAug + :members: + +.. autoclass:: pythainlp.augment.word2vec.Thai2fitAug + :members: + +.. autoclass:: pythainlp.augment.word2vec.LTW2VAug + :members: + +FastTextAug and Thai2transformersAug Classes +-------------------------------------------- + +The `pythainlp.augment.lm` package offers classes for text augmentation using language models. These classes include `FastTextAug` and `Thai2transformersAug`. These classes allow you to use language model-based techniques to diversify text data. Explore their methods to understand their capabilities. + +.. autoclass:: pythainlp.augment.lm.FastTextAug + :members: + +.. autoclass:: pythainlp.augment.lm.Thai2transformersAug + :members: + +BPEmbAug Class +-------------- + +The `pythainlp.augment.word2vec.bpemb_wv` package contains the `BPEmbAug` class, which is designed for text augmentation using subword embeddings. This class is particularly useful when working with subword representations for Thai text augmentation. + +.. autoclass:: pythainlp.augment.word2vec.bpemb_wv.BPEmbAug + :members: + +Additional Functions +------------------- + +To further enhance your text augmentation tasks, the `pythainlp.augment` module offers the following functions: + +- `postype2wordnet`: This function maps part-of-speech tags to WordNet-compatible POS tags, facilitating the integration of WordNet augmentation with Thai text. + +These functions and classes provide diverse techniques for text augmentation in the Thai language, making this module a valuable asset for NLP researchers, developers, and practitioners. + +For detailed usage examples and guidelines, please refer to the official PyThaiNLP documentation. The `pythainlp.augment` module opens up new possibilities for enriching and diversifying Thai text data, leading to improved NLP models and applications. 
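A short, hedged usage sketch (the sentence is illustrative, and the exact return format of ``WordNetAug.augment`` may vary between releases):

::

    from pythainlp.augment import WordNetAug

    aug = WordNetAug()
    # augment a Thai sentence with WordNet-based synonym substitution
    print(aug.augment("เธอชอบกินข้าวผัด"))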
diff --git a/5.1/_sources/api/.ipynb_checkpoints/transliterate-checkpoint.rst.txt b/5.1/_sources/api/.ipynb_checkpoints/transliterate-checkpoint.rst.txt new file mode 100644 index 0000000..6222e9c --- /dev/null +++ b/5.1/_sources/api/.ipynb_checkpoints/transliterate-checkpoint.rst.txt @@ -0,0 +1,67 @@ +.. currentmodule:: pythainlp.transliterate + +pythainlp.transliterate +======================= +The :mod:`pythainlp.transliterate` module is dedicated to the transliteration of Thai text into romanized form, effectively spelling it out with the English alphabet. This functionality is invaluable for making Thai text more accessible to non-Thai speakers and for various language processing tasks. + +Modules +------- + +.. autofunction:: romanize + :noindex: + + The `romanize` function allows you to transliterate Thai text, converting it into a phonetic representation using the English alphabet. It's a fundamental tool for rendering Thai words and phrases in a more familiar format. + +.. autofunction:: transliterate + :noindex: + + The `transliterate` function serves as a versatile transliteration tool, offering a range of transliteration engines to choose from. It provides flexibility and customization for your transliteration needs. + +.. autofunction:: pronunciate + :noindex: + + This function provides assistance in generating phonetic representations of Thai words, which is particularly useful for language learning and pronunciation practice. + +.. autofunction:: puan + :noindex: + + The `puan` function offers a unique transliteration feature known as "Puan." It provides a specialized transliteration method for Thai text and is an additional option for rendering Thai text into English characters. + +.. autoclass:: pythainlp.transliterate.wunsen.WunsenTransliterate + :members: + + The `WunsenTransliterate` class represents a transliteration engine known as "Wunsen." It offers specific transliteration methods for rendering Thai text into a phonetic English format. + +Transliteration Engines +----------------------- + +**thai2rom** + +.. autofunction:: pythainlp.transliterate.thai2rom.romanize + + The `thai2rom` engine specializes in transliterating Thai text into romanized form. It's particularly useful for rendering Thai words accurately in an English phonetic format. + +**royin** + +.. autofunction:: pythainlp.transliterate.royin.romanize + + The `royin` engine focuses on transliterating Thai text into English characters. It provides an alternative approach to transliteration, ensuring accurate representation of Thai words. + +**Transliterate Engines** + +This section includes multiple transliteration engines designed to suit various use cases. They offer unique methods for transliterating Thai text into romanized form: + +- **icu**: Utilizes the ICU transliteration system for phonetic conversion. +- **ipa**: Provides International Phonetic Alphabet (IPA) representation of Thai text. +- **thaig2p**: (default) Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation. +- **thaig2p_v2**: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation. This model is from https://huggingface.co/pythainlp/thaig2p-v2.0 +- **tltk**: Utilizes the TLTK transliteration system for a specific approach to transliteration. +- **iso_11940**: Focuses on the ISO 11940 transliteration standard. + +References +---------- + +.. [#rtgs_transcription] Nitaya Kanchanawan. (2006). `Romanization, Transliteration, and Transcription for the Globalization of the Thai Language. 
`_ + The Journal of the Royal Institute of Thailand. + +The `pythainlp.transliterate` module offers a comprehensive set of tools and engines for transliterating Thai text into Romanized form. Whether you need a simple transliteration, specific engines for accurate representation, or phonetic rendering, this module provides a wide range of options. Additionally, the module references a publication that highlights the significance of Romanization, Transliteration, and Transcription in making the Thai language accessible to a global audience. diff --git a/5.1/_sources/api/.ipynb_checkpoints/word_vector-checkpoint.rst.txt b/5.1/_sources/api/.ipynb_checkpoints/word_vector-checkpoint.rst.txt new file mode 100644 index 0000000..107328e --- /dev/null +++ b/5.1/_sources/api/.ipynb_checkpoints/word_vector-checkpoint.rst.txt @@ -0,0 +1,28 @@ +.. currentmodule:: pythainlp.word_vector + +pythainlp.word_vector +======================= +The :class:`word_vector` contains functions that makes use of a pre-trained vector public data. +The `pythainlp.word_vector` module is a valuable resource for working with pre-trained word vectors. These word vectors are trained on large corpora and can be used for various natural language processing tasks, such as word similarity, document similarity, and more. + +Dependencies +------------ +Installation of :mod:`numpy` and :mod:`gensim` is required. + +Before using this module, you need to ensure that the `numpy` and `gensim` libraries are installed in your environment. These libraries are essential for loading and working with the pre-trained word vectors. + +Modules +------- + +.. autoclass:: WordVector + :members: + + The `WordVector` class encapsulates word vector operations and functions. It provides a convenient interface for loading models, finding word similarities, and generating sentence vectors. + +References +---------- + +- [Omer Levy and Yoav Goldberg (2014). Linguistic Regularities in Sparse and Explicit Word Representations](https://www.aclweb.org/anthology/W14-1618/) + This reference points to the work by Omer Levy and Yoav Goldberg, which discusses linguistic regularities in word representations. It underlines the theoretical foundation of word vectors and their applications in NLP. + +This enhanced documentation provides a more detailed and organized overview of the `pythainlp.word_vector` module, making it a valuable resource for NLP practitioners and researchers working with pre-trained word vectors in the Thai language. diff --git a/5.1/_sources/api/ancient.rst.txt b/5.1/_sources/api/ancient.rst.txt new file mode 100644 index 0000000..5e498f4 --- /dev/null +++ b/5.1/_sources/api/ancient.rst.txt @@ -0,0 +1,9 @@ +.. currentmodule:: pythainlp.ancient + +pythainlp.ancient +================= + +Modules +------- + +.. autofunction:: aksonhan_to_current \ No newline at end of file diff --git a/5.1/_sources/api/augment.rst.txt b/5.1/_sources/api/augment.rst.txt new file mode 100644 index 0000000..c41be8b --- /dev/null +++ b/5.1/_sources/api/augment.rst.txt @@ -0,0 +1,62 @@ +.. currentmodule:: pythainlp.augment + +pythainlp.augment +================= + +Introduction +------------ + +The `pythainlp.augment` module is a powerful toolset for text augmentation in the Thai language. Text augmentation is a process that enriches and diversifies textual data by generating alternative versions of the original text. This module is a valuable resource for improving the quality and variety of Thai language data for NLP tasks. 
+ + +WordNetAug Class +---------------- + +The `WordNetAug` class is designed to perform text augmentation using WordNet, a lexical database for English. This class enables you to augment Thai text using English synonyms, offering a unique approach to text diversification. The following methods are available within this class: + +.. autoclass:: pythainlp.augment.WordNetAug + :members: + +Word2VecAug, Thai2fitAug, LTW2VAug Classes +------------------------------------------ + +The `pythainlp.augment.word2vec` package contains multiple classes for text augmentation using Word2Vec models. These classes include `Word2VecAug`, `Thai2fitAug`, and `LTW2VAug`. Each of these classes allows you to use Word2Vec embeddings to generate text variations. Explore the methods provided by these classes to understand their capabilities. + +.. autoclass:: pythainlp.augment.word2vec.Word2VecAug + :members: + +.. autoclass:: pythainlp.augment.word2vec.Thai2fitAug + :members: + +.. autoclass:: pythainlp.augment.word2vec.LTW2VAug + :members: + +FastTextAug and Thai2transformersAug Classes +-------------------------------------------- + +The `pythainlp.augment.lm` package offers classes for text augmentation using language models. These classes include `FastTextAug` and `Thai2transformersAug`. These classes allow you to use language model-based techniques to diversify text data. Explore their methods to understand their capabilities. + +.. autoclass:: pythainlp.augment.lm.FastTextAug + :members: + +.. autoclass:: pythainlp.augment.lm.Thai2transformersAug + :members: + +BPEmbAug Class +-------------- + +The `pythainlp.augment.word2vec.bpemb_wv` package contains the `BPEmbAug` class, which is designed for text augmentation using subword embeddings. This class is particularly useful when working with subword representations for Thai text augmentation. + +.. autoclass:: pythainlp.augment.word2vec.bpemb_wv.BPEmbAug + :members: + +Additional Functions +------------------- + +To further enhance your text augmentation tasks, the `pythainlp.augment` module offers the following functions: + +- `postype2wordnet`: This function maps part-of-speech tags to WordNet-compatible POS tags, facilitating the integration of WordNet augmentation with Thai text. + +These functions and classes provide diverse techniques for text augmentation in the Thai language, making this module a valuable asset for NLP researchers, developers, and practitioners. + +For detailed usage examples and guidelines, please refer to the official PyThaiNLP documentation. The `pythainlp.augment` module opens up new possibilities for enriching and diversifying Thai text data, leading to improved NLP models and applications. diff --git a/5.1/_sources/api/benchmarks.rst.txt b/5.1/_sources/api/benchmarks.rst.txt new file mode 100644 index 0000000..53d0aa8 --- /dev/null +++ b/5.1/_sources/api/benchmarks.rst.txt @@ -0,0 +1,44 @@ +.. currentmodule:: pythainlp.benchmarks + +pythainlp.benchmarks +==================== + +Introduction +------------ + +The `pythainlp.benchmarks` module is a collection of utility functions designed for benchmarking tasks related to Thai Natural Language Processing (NLP). Currently, the module includes tools for word tokenization benchmarking. Please note that additional benchmarking tasks will be incorporated in the future. + +Tokenization +------------ + +Word tokenization is a fundamental task in NLP, and it plays a crucial role in various applications, such as text analysis and language processing. 
The `pythainlp.benchmarks` module offers a set of functions to assist in the benchmarking and evaluation of word tokenization methods. + +Quality Evaluation +^^^^^^^^^^^^^^^^^^ + +The quality of word tokenization can significantly impact the accuracy of downstream NLP tasks. To assess the quality of word tokenization, the module provides a qualitative evaluation using various metrics and techniques. + +.. figure:: ../images/evaluation.png + :scale: 50 % + + Qualitative evaluation of word tokenization. + +Functions +--------- + +.. autofunction:: pythainlp.benchmarks.word_tokenization.compute_stats + + This function is used to compute various statistics and metrics related to word tokenization. It allows you to assess the performance of different tokenization methods. + +.. autofunction:: pythainlp.benchmarks.word_tokenization.benchmark + + The `benchmark` function facilitates the benchmarking of word tokenization methods. It provides an organized framework for evaluating and comparing the effectiveness of different tokenization tools. + +.. autofunction:: pythainlp.benchmarks.word_tokenization.preprocessing + + Preprocessing is a crucial step in NLP tasks. The `preprocessing` function assists in preparing text data for tokenization, which is essential for accurate and consistent benchmarking. + +Usage +----- + +To make use of these benchmarking functions, you can follow the provided examples and guidelines in the official PyThaiNLP documentation. These tools are invaluable for researchers, developers, and anyone interested in improving and evaluating Thai word tokenization methods. diff --git a/5.1/_sources/api/chat.rst.txt b/5.1/_sources/api/chat.rst.txt new file mode 100644 index 0000000..3d0d9d2 --- /dev/null +++ b/5.1/_sources/api/chat.rst.txt @@ -0,0 +1,7 @@ +.. currentmodule:: pythainlp.chat + +pythainlp.chat +============== + +.. autoclass:: ChatBotModel + :members: \ No newline at end of file diff --git a/5.1/_sources/api/classify.rst.txt b/5.1/_sources/api/classify.rst.txt new file mode 100644 index 0000000..f6ad689 --- /dev/null +++ b/5.1/_sources/api/classify.rst.txt @@ -0,0 +1,7 @@ +.. currentmodule:: pythainlp.classify + +pythainlp.classify +============= + +.. autoclass:: GzipModel + :members: diff --git a/5.1/_sources/api/coref.rst.txt b/5.1/_sources/api/coref.rst.txt new file mode 100644 index 0000000..ffffc1b --- /dev/null +++ b/5.1/_sources/api/coref.rst.txt @@ -0,0 +1,38 @@ +.. currentmodule:: pythainlp.coref + +pythainlp.coref +=============== +Introduction +------------ + +The `pythainlp.coref` module is dedicated to Coreference Resolution for the Thai language. Coreference resolution is a crucial task in natural language processing (NLP) that deals with identifying and linking expressions (such as pronouns) in a text to the entities or concepts they refer to. This module provides tools to tackle coreference resolution challenges in the context of the Thai language. + +Coreference Resolution Function +------------------------------- + +The primary component of the `pythainlp.coref` module is the `coreference_resolution` function. This function is designed to analyze text and identify instances of coreference, helping NLP systems understand when different expressions in the text refer to the same entity. Here's how you can use it: + +The :class:`pythainlp.coref` is Coreference Resolution for Thai. + +.. autofunction:: coreference_resolution + +Usage +----- + +To use the `coreference_resolution` function effectively, follow these steps: + +1. 
Import the `coreference_resolution` function from the `pythainlp.coref` module. + +2. Pass the Thai text you want to analyze for coreferences as input to the function. + +3. The function will process the text and return information about coreference relationships within the text. + +Example: + +:: + from pythainlp.coref import coreference_resolution + + text = "นาย A มาจาก กรุงเทพ และเขา มีความรักต่อ บางกิจ ของเขา" + coreferences = coreference_resolution(text) + + print(coreferences) diff --git a/5.1/_sources/api/corpus.rst.txt b/5.1/_sources/api/corpus.rst.txt new file mode 100644 index 0000000..8f5a981 --- /dev/null +++ b/5.1/_sources/api/corpus.rst.txt @@ -0,0 +1,291 @@ +.. currentmodule:: pythainlp.corpus + +pythainlp.corpus +================ +The :class:`pythainlp.corpus` module provides access to various Thai language corpora and resources that come bundled with PyThaiNLP. These resources are essential for natural language processing tasks in the Thai language. + +Modules +------- + +countries +~~~~~~~~~~ +.. autofunction:: countries + :noindex: + +find_synonym +~~~~~~~~~~~~ +.. autofunction:: find_synonym + :noindex: + +get_corpus +~~~~~~~~~~ +.. autofunction:: get_corpus + :noindex: + +get_corpus_as_is +~~~~~~~~~~ +.. autofunction:: get_corpus_as_is + :noindex: + +get_corpus_db +~~~~~~~~~~~~~~ +.. autofunction:: get_corpus_db + :noindex: + +get_corpus_db_detail +~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: get_corpus_db_detail + :noindex: + +get_corpus_default_db +~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: get_corpus_default_db + :noindex: + +get_corpus_path +~~~~~~~~~~~~~~ +.. autofunction:: get_corpus_path + :noindex: + +download +~~~~~~~~~~ +.. autofunction:: download + :noindex: + +remove +~~~~~~~ +.. autofunction:: remove + :noindex: + +provinces +~~~~~~~~~~ +.. autofunction:: provinces + :noindex: + +thai_dict +~~~~~~~~~~ +.. autofunction:: thai_dict + :noindex: + +thai_stopwords +~~~~~~~~~~~~~~ +.. autofunction:: thai_stopwords + :noindex: + +thai_words +~~~~~~~~~~ +.. autofunction:: thai_words + :noindex: + +thai_wsd_dict +~~~~~~~~~~~~~~ +.. autofunction:: thai_wsd_dict + :noindex: + +thai_orst_words +~~~~~~~~~~~~~~~~~ +.. autofunction:: thai_orst_words + :noindex: + +thai_synonyms +~~~~~~~~~~~~~~ +.. autofunction:: thai_synonyms + :noindex: + +thai_syllables +~~~~~~~~~~~~~~ +.. autofunction:: thai_syllables + :noindex: + +thai_negations +~~~~~~~~~~~~~~ +.. autofunction:: thai_negations + :noindex: + +thai_family_names +~~~~~~~~~~~~~~~~~~~ +.. autofunction:: thai_family_names + :noindex: + +thai_female_names +~~~~~~~~~~~~~~~~~~~ +.. autofunction:: thai_female_names + :noindex: + +thai_male_names +~~~~~~~~~~~~~~~~ +.. autofunction:: thai_male_names + :noindex: + +pythainlp.corpus.th_en_translit.get_transliteration_dict +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.th_en_translit.get_transliteration_dict + :noindex: + +ConceptNet +---------- + +ConceptNet is an open, multilingual knowledge graph used for various natural language understanding tasks. For more information, refer to the `ConceptNet documentation `_. + +pythainlp.corpus.conceptnet.edges +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.conceptnet.edges + :noindex: + +TNC (Thai National Corpus) +--- + +The Thai National Corpus (TNC) is a collection of text data in the Thai language. This module provides access to word frequency data from the TNC corpus. + +pythainlp.corpus.tnc.word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
autofunction:: pythainlp.corpus.tnc.word_freqs + :noindex: + +pythainlp.corpus.tnc.unigram_word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.tnc.unigram_word_freqs + :noindex: + +pythainlp.corpus.tnc.bigram_word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.tnc.bigram_word_freqs + :noindex: + +pythainlp.corpus.tnc.trigram_word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.tnc.trigram_word_freqs + :noindex: + +TTC (Thai Textbook Corpus) +--- + +The Thai Textbook Corpus (TTC) is a collection of Thai language text data, primarily sourced from textbooks. + +pythainlp.corpus.ttc.word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.ttc.word_freqs + :noindex: + +pythainlp.corpus.ttc.unigram_word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.ttc.unigram_word_freqs + :noindex: + +OSCAR +----- + +OSCAR is a multilingual corpus that includes Thai text data. This module provides access to word frequency data from the OSCAR corpus. + +pythainlp.corpus.oscar.word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.oscar.word_freqs + :noindex: + +pythainlp.corpus.oscar.unigram_word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.oscar.unigram_word_freqs + :noindex: + +Util +---- + +Utilities for working with the corpus data. + +pythainlp.corpus.util.find_badwords +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.util.find_badwords + :noindex: + +pythainlp.corpus.util.revise_wordset +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.util.revise_wordset + :noindex: + +pythainlp.corpus.util.revise_newmm_default_wordset +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.util.revise_newmm_default_wordset + :noindex: + +WordNet +------- + +PyThaiNLP API includes the WordNet module, which is an exact copy of NLTK's WordNet API for the Thai language. WordNet is a lexical database for English and other languages. + +For more details on WordNet, refer to the `NLTK WordNet documentation `_. + +pythainlp.corpus.wordnet.synsets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.wordnet.synsets + :noindex: + +pythainlp.corpus.wordnet.synset +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.wordnet.synset + :noindex: + +pythainlp.corpus.wordnet.all_lemma_names +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.wordnet.all_lemma_names + :noindex: + +pythainlp.corpus.wordnet.all_synsets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.wordnet.all_synsets + :noindex: + +pythainlp.corpus.wordnet.langs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.wordnet.langs + :noindex: + +pythainlp.corpus.wordnet.lemmas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.wordnet.lemmas + :noindex: + +pythainlp.corpus.wordnet.lemma +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.wordnet.lemma + :noindex: + +pythainlp.corpus.wordnet.lemma_from_key +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.wordnet.lemma_from_key + :noindex: + +pythainlp.corpus.wordnet.path_similarity +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
autofunction:: pythainlp.corpus.wordnet.path_similarity + :noindex: + +pythainlp.corpus.wordnet.lch_similarity +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.wordnet.lch_similarity + :noindex: + +pythainlp.corpus.wordnet.wup_similarity +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.wordnet.wup_similarity + :noindex: + +pythainlp.corpus.wordnet.morphy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.wordnet.morphy + :noindex: + +pythainlp.corpus.wordnet.custom_lemmas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.corpus.wordnet.custom_lemmas + :noindex: + +Definition +++++++++++ + +Synset +~~~~~~~ +A synset is a set of synonyms that share a common meaning. The WordNet module provides functionality to work with these synsets. + +This documentation is designed to help you navigate and use the various resources and modules available in the `pythainlp.corpus` package effectively. If you have any questions or need further assistance, please refer to the PyThaiNLP documentation or reach out to the PyThaiNLP community for support. + +We hope you find this documentation helpful for your natural language processing tasks in the Thai language. diff --git a/5.1/_sources/api/el.rst.txt b/5.1/_sources/api/el.rst.txt new file mode 100644 index 0000000..e38ab42 --- /dev/null +++ b/5.1/_sources/api/el.rst.txt @@ -0,0 +1,54 @@ +.. currentmodule:: pythainlp.el + +pythainlp.el +============ +The :class:`pythainlp.el` module is an essential component of Thai Entity Linking within the PyThaiNLP library. Entity Linking is a key natural language processing task that associates mentions in text with corresponding entities in a knowledge base. + +.. autoclass:: EntityLinker + :members: + +EntityLinker +------------ + +The :class:`EntityLinker` class is the core component of the `pythainlp.el` module, responsible for Thai Entity Linking. Entity Linking, also known as Named Entity Linking (NEL), plays a critical role in various applications, including question answering, information retrieval, and knowledge graph construction. + +.. Attributes and Methods +.. ~~~~~~~~~~~~~~~~~~~~~~ + +.. The `EntityLinker` class offers the following attributes and methods: + +.. - `__init__(text, engine="default")` +.. - The constructor for the `EntityLinker` class. It takes the input `text` and an optional `engine` parameter to specify the entity linking engine. The default engine is used if no specific engine is provided. + +.. - `link()` +.. - The `link` method performs entity linking on the input text using the specified engine. It returns a list of entities linked in the text, along with their relevant information. + +.. - `set_engine(engine)` +.. - The `set_engine` method allows you to change the entity linking engine during runtime. This provides flexibility in selecting different engines for entity linking based on your specific requirements. + +.. - `get_linked_entities()` +.. - The `get_linked_entities` method retrieves a list of linked entities from the last entity linking operation. This is useful for extracting the entities found in the text. + +.. Usage +.. ~~~~~ + +.. To use the `EntityLinker` class for entity linking, follow these steps: + +.. 1. Initialize an `EntityLinker` object with the input text and, optionally, specify the engine. + +.. 2. Call the `link` method to perform entity linking on the text. + +.. 3. Utilize the `get_linked_entities` method to access the linked entities found in the text. 
+ +Example +~~~~~~~ + +Here's a simple example of how to use the `EntityLinker` class: + +:: + from pythainlp.el import EntityLinker + + text = "กรุงเทพเป็นเมืองหลวงของประเทศไทย" + el = EntityLinker() + linked_entities = el.get_el(text) + print(linked_entities) diff --git a/5.1/_sources/api/generate.rst.txt b/5.1/_sources/api/generate.rst.txt new file mode 100644 index 0000000..c6c1839 --- /dev/null +++ b/5.1/_sources/api/generate.rst.txt @@ -0,0 +1,72 @@ +.. currentmodule:: pythainlp.generate + +pythainlp.generate +================== +The :class:`pythainlp.generate` module is a powerful tool for generating Thai text using PyThaiNLP. It includes several classes and functions that enable users to create text based on various language models and n-gram models. + +Modules +------- + +Unigram +~~~~~~~ +.. autoclass:: Unigram + :members: + +The :class:`Unigram` class provides functionality for generating text based on unigram language models. Unigrams are single words or tokens, and this class allows you to create text by selecting words probabilistically based on their frequencies in the training data. + +Bigram +~~~~~~ +.. autoclass:: Bigram + :members: + +The :class:`Bigram` class is designed for generating text using bigram language models. Bigrams are sequences of two words, and this class enables you to generate text by predicting the next word based on the previous word's probability. + +Trigram +~~~~~~~ +.. autoclass:: Trigram + :members: + +The :class:`Trigram` class extends text generation to trigram language models. Trigrams consist of three consecutive words, and this class facilitates the creation of text by predicting the next word based on the two preceding words' probabilities. + +pythainlp.generate.thai2fit.gen_sentence +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.generate.thai2fit.gen_sentence + :noindex: + +The function :func:`pythainlp.generate.thai2fit.gen_sentence` offers a convenient way to generate sentences using the Thai2Vec language model. It takes a seed text as input and generates a coherent sentence based on the provided context. + +pythainlp.generate.wangchanglm.WangChanGLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: pythainlp.generate.wangchanglm.WangChanGLM + :members: + +The :class:`WangChanGLM` class is a part of the `pythainlp.generate.wangchanglm` module, offering text generation capabilities. It includes methods for creating text using the WangChanGLM language model. + +Usage +~~~~~ + +To use the text generation capabilities provided by the `pythainlp.generate` module, follow these steps: + +1. Select the appropriate class or function based on the type of language model you want to use (Unigram, Bigram, Trigram, Thai2Vec, or WangChanGLM). + +2. Initialize the selected class or use the function with the necessary parameters. + +3. Call the appropriate methods to generate text based on the chosen model. + +4. Utilize the generated text for various applications, such as chatbots, content generation, and more. + +Example +~~~~~~~ + +Here's a simple example of how to generate text using the `Unigram` class: + +:: + from pythainlp.generate import Unigram + + # Initialize the Unigram model + unigram = Unigram() + + # Generate a sentence + sentence = unigram.gen_sentence("สวัสดีครับ") + + print(sentence) diff --git a/5.1/_sources/api/khavee.rst.txt b/5.1/_sources/api/khavee.rst.txt new file mode 100644 index 0000000..ea6dcf1 --- /dev/null +++ b/5.1/_sources/api/khavee.rst.txt @@ -0,0 +1,64 @@ +.. 
currentmodule:: pythainlp.khavee + +pythainlp.khavee +================ +The :class:`pythainlp.khavee` module is a powerful toolkit designed for working with Thai poetry. The term "khavee" corresponds to "กวี" in the Thai language, which translates to "Poetry" in English. This toolkit equips users with the tools and utilities necessary for the creation, analysis, and verification of Thai poetry. + +Modules +------- + +KhaveeVerifier +~~~~~~~~~~~~~~ +.. autoclass:: KhaveeVerifier + :special-members: + :members: + +The :class:`KhaveeVerifier` class is the primary component of the `pythainlp.khavee` module, dedicated to the verification of Thai poetry. It offers a range of functions and methods for analyzing and validating Thai poetry, ensuring its adherence to the rules and structure of classical Thai poetic forms. + +.. Attributes and Methods +.. ~~~~~~~~~~~~~~~~~~~~~~ + +.. The `KhaveeVerifier` class provides a variety of attributes and methods to facilitate the verification of Thai poetry. Some of its key features include: + +.. - `__init__(rules: dict = None, stanza_rules: dict = None, verbose: bool = False)` +.. - The constructor for the `KhaveeVerifier` class, allowing you to initialize an instance with custom rules, stanza rules, and verbosity settings. + +.. - `is_khavee(text: str, rules: dict = None)` +.. - The `is_khavee` method checks whether a given text conforms to the rules of Thai poetry. It returns `True` if the text is a valid Thai poem according to the specified rules, and `False` otherwise. + +.. - `get_rules()` +.. - The `get_rules` method retrieves the current set of rules being used by the verifier. This is helpful for inspecting and modifying the rules during runtime. + +.. - `set_rules(rules: dict)` +.. - The `set_rules` method allows you to set custom rules for the verifier, offering flexibility in defining specific constraints for Thai poetry. + +.. Usage +.. ~~~~~ + +.. To use the `KhaveeVerifier` class for Thai poetry verification, follow these steps: + +.. 1. Initialize an instance of the `KhaveeVerifier` class, optionally specifying custom rules and verbosity settings. + +.. 2. Use the `is_khavee` method to verify whether a given text adheres to the rules of Thai poetry. The method returns a Boolean value indicating the result. + +.. 3. Utilize the `get_rules` and `set_rules` methods to inspect and modify the rules as needed. + +Example +~~~~~~~ + +Here's a basic example of how to use the `KhaveeVerifier` class to verify Thai poetry: + +:: + + from pythainlp.khavee import KhaveeVerifier + + # Initialize a KhaveeVerifier instance + verifier = KhaveeVerifier() + + # Text to verify + poem_text = "ดอกไม้สวยงาม แสนสดใส" + + # Verify if the text is Thai poetry + is_poetry = verifier.is_khavee(poem_text) + + print(f"The provided text is Thai poetry: {is_poetry}") diff --git a/5.1/_sources/api/lm.rst.txt b/5.1/_sources/api/lm.rst.txt new file mode 100644 index 0000000..063aecb --- /dev/null +++ b/5.1/_sources/api/lm.rst.txt @@ -0,0 +1,10 @@ +.. currentmodule:: pythainlp.lm + +pythainlp.lm +============ + +Modules +------- + +.. autofunction:: calculate_ngram_counts +.. autofunction:: remove_repeated_ngrams \ No newline at end of file diff --git a/5.1/_sources/api/morpheme.rst.txt b/5.1/_sources/api/morpheme.rst.txt new file mode 100644 index 0000000..b227cae --- /dev/null +++ b/5.1/_sources/api/morpheme.rst.txt @@ -0,0 +1,13 @@ +.. 
currentmodule:: pythainlp.morpheme
+
+pythainlp.morpheme
+==================
+
+The `pythainlp.morpheme` module collects functions for morpheme analysis, word formation, and more for the Thai language.
+
+.. autofunction:: nighit
+
+.. autofunction:: is_native_thai
+    :noindex:
+
+    The `is_native_thai` function checks whether a given word is a native Thai word rather than a loanword. It aids in morphological analysis and vocabulary categorization tasks.
diff --git a/5.1/_sources/api/parse.rst.txt b/5.1/_sources/api/parse.rst.txt
new file mode 100644
index 0000000..15a2921
--- /dev/null
+++ b/5.1/_sources/api/parse.rst.txt
@@ -0,0 +1,41 @@
+.. currentmodule:: pythainlp.parse
+
+pythainlp.parse
+===============
+The :class:`pythainlp.parse` module provides dependency parsing for the Thai language. Dependency parsing is a fundamental task in natural language processing that involves identifying the grammatical relationships between words in a sentence, which helps to analyze sentence structure and meaning.
+
+Modules
+-------
+
+dependency_parsing
+~~~~~~~~~~~~~~~~~~
+.. autofunction:: dependency_parsing
+
+The `dependency_parsing` function is the core component of the `pythainlp.parse` module. It offers dependency parsing capabilities for the Thai language. Given a Thai sentence as input, this function parses the sentence to identify the grammatical relationships between words, creating a dependency tree that represents the sentence's structure.
+
+Usage
+~~~~~
+
+To use the `dependency_parsing` function for Thai dependency parsing, follow these steps:
+
+1. Import the `pythainlp.parse` module.
+2. Use the `dependency_parsing` function with a Thai sentence as input.
+3. The function will return the dependency parsing results, which include information about the grammatical relationships between words.
+
+Example
+~~~~~~~
+
+Here's a basic example of how to use the `dependency_parsing` function:
+
+::
+
+    from pythainlp.parse import dependency_parsing
+
+    # Input Thai sentence
+    sentence = "พี่น้องชาวบ้านกำลังเลี้ยงสตางค์ในสวน"
+
+    # Perform dependency parsing
+    parsing_result = dependency_parsing(sentence)
+
+    # Print the parsing result
+    print(parsing_result)
diff --git a/5.1/_sources/api/phayathaibert.rst.txt b/5.1/_sources/api/phayathaibert.rst.txt
new file mode 100644
index 0000000..348ab5a
--- /dev/null
+++ b/5.1/_sources/api/phayathaibert.rst.txt
@@ -0,0 +1,18 @@
+.. currentmodule:: pythainlp.phayathaibert
+
+pythainlp.phayathaibert
+=======================
+The `pythainlp.phayathaibert` module provides text processing, augmentation, and tagging tools built upon the PhayaThaiBERT base model.
+
+Modules
+-------
+
+.. autoclass:: ThaiTextProcessor
+    :members:
+.. autoclass:: ThaiTextAugmenter
+    :members:
+.. autoclass:: PartOfSpeechTagger
+    :members:
+.. autoclass:: NamedEntityTagger
+    :members:
+.. autofunction:: segment
\ No newline at end of file
diff --git a/5.1/_sources/api/soundex.rst.txt b/5.1/_sources/api/soundex.rst.txt
new file mode 100644
index 0000000..22e335f
--- /dev/null
+++ b/5.1/_sources/api/soundex.rst.txt
@@ -0,0 +1,69 @@
+.. currentmodule:: pythainlp.soundex
+
+pythainlp.soundex
+=================
+The :class:`pythainlp.soundex` module provides soundex algorithms for the Thai language. Soundex is a phonetic algorithm used to encode words or names into a standardized representation based on their pronunciation, making it useful for tasks like name matching and search.
+
+Modules
+-------
+
+soundex
+~~~~~~~
+.. 
autofunction:: soundex + +The `soundex` function is a basic Soundex algorithm for the Thai language. It encodes a Thai word into a Soundex code, allowing for approximate matching of words with similar pronunciation. + +lk82 +~~~~ +.. autofunction:: lk82 + +The `lk82` module implements the Thai Soundex algorithm proposed by Vichit Lorchirachoonkul in 1982. This module is suitable for encoding Thai words into Soundex codes for phonetic comparisons. + +udom83 +~~~~~~ +.. autofunction:: udom83 + +The `udom83` module is based on a homonymic approach for sound-alike string search. It encodes Thai words using the Wannee Udompanich Soundex algorithm developed in 1983. + +metasound +~~~~~~~~~ +.. autofunction:: metasound + +The `metasound` module implements a novel phonetic name matching algorithm with a statistical ontology for analyzing names based on Thai astrology. It offers advanced phonetic matching capabilities for Thai names. + +prayut_and_somchaip +~~~~~~~~~~~~~~~~~~~ +.. autofunction:: prayut_and_somchaip + +The `prayut_and_somchaip` module is designed for Thai-English cross-language transliterated word retrieval using the Soundex technique. It is particularly useful for matching transliterated words in both languages. + +pythainlp.soundex.sound.word_approximation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.soundex.sound.word_approximation + +The `pythainlp.soundex.sound.word_approximation` module offers word approximation functionality. It allows users to find Thai words that are phonetically similar to a given word. + +pythainlp.soundex.sound.audio_vector +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.soundex.sound.audio_vector + +The `pythainlp.soundex.sound.audio_vector` module provides audio vector functionality for Thai words. It allows users to work with audio vectors based on phonetic properties. + +pythainlp.soundex.sound.word2audio +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pythainlp.soundex.sound.word2audio + +The `pythainlp.soundex.sound.word2audio` module is designed for converting Thai words to audio representations. It enables users to obtain audio vectors for Thai words, which can be used for various applications. + +References +---------- + +.. [#metasound] Snae & Brückner. (2009). `Novel Phonetic Name Matching Algorithm with a Statistical Ontology for Analyzing Names Given in Accordance with Thai Astrology `_. + +.. [#udom83] Wannee Udompanich (1983). `String searching for Thai alphabet using Soundex compression technique `_. Master Thesis. Chulalongkorn University, Thailand. วรรณี อุดมพาณิชย์. `การใช้หลักคำพ้องเสียง เพื่อค้นหาชุดอักขระภาษาไทยที่ออกเสียงเหมือนกัน `_. วิทยานิพนธ์ (วท.ม.)--จุฬาลงกรณ์มหาวิทยาลัย, 2526. + +.. [#lk82] Vichit Lorchirachoonkul. 1982. `A Thai soundex system `_. Information Processing & Management, 18(5):243–255. วิชิต หล่อจีระชุณห์กุล และ เจริญ คุวินทร์พันธุ์. `โปรแกรมการสืบค้นคำไทยตามเสียงอ่าน (Thai Soundex) `_. + +.. [#prayut_and_somchaip] Prayut Suwanvisat, Somchai Prasitjutrakul. Thai-English Cross-Language Transliterated Word Retrieval using Soundex Technique. In 1998 [cited 2022 Sep 8]. Available from: https://www.cp.eng.chula.ac.th/~somchai/spj/papers/ThaiText/ncsec98-clir.pdf. + +.. This enhanced documentation provides clear descriptions of all the modules within the `pythainlp.soundex` module, including their purposes and functionalities. Users can now better understand how to leverage these soundex algorithms for various phonetic matching tasks in the Thai language. 
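+
+Example
+-------
+
+A minimal usage sketch of the soundex functions documented above, assuming the default ``udom83`` engine; the printed codes are for illustration only:
+
+::
+
+    from pythainlp.soundex import lk82, metasound, soundex, udom83
+
+    word = "บูรณะ"
+
+    # Encode the same word with several engines and compare the codes.
+    print(soundex(word))  # uses the default engine ("udom83")
+    print(udom83(word), lk82(word), metasound(word))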
diff --git a/5.1/_sources/api/spell.rst.txt b/5.1/_sources/api/spell.rst.txt
new file mode 100644
index 0000000..fa36e57
--- /dev/null
+++ b/5.1/_sources/api/spell.rst.txt
@@ -0,0 +1,52 @@
+.. currentmodule:: pythainlp.spell
+
+pythainlp.spell
+===============
+The :class:`pythainlp.spell` module is a powerful tool for finding the closest correctly spelled word to a given text in the Thai language. It provides functionalities to correct spelling errors and enhance the accuracy of text processing.
+
+Modules
+-------
+
+correct
+~~~~~~~
+.. autofunction:: correct
+
+The `correct` function is designed to correct the spelling of a single Thai word. Given an input word, this function returns the closest correctly spelled word from the dictionary, making it valuable for spell-checking and text correction tasks.
+
+correct_sent
+~~~~~~~~~~~~
+.. autofunction:: correct_sent
+
+The `correct_sent` function is an extension of the `correct` function and is used to correct an entire sentence. It takes a tokenized sentence, corrects each word, and returns the corrected sentence. This is beneficial for proofreading and improving the readability of Thai text.
+
+spell
+~~~~~
+.. autofunction:: spell
+
+The `spell` function suggests corrections for a given Thai word. It returns a list of candidate words ranked by likelihood, which is useful for building spell-checking and word-suggestion features.
+
+spell_sent
+~~~~~~~~~~
+.. autofunction:: spell_sent
+
+The `spell_sent` function extends spelling suggestion to entire sentences. Given a tokenized sentence, it returns a list of candidate corrected sentences.
+
+NorvigSpellChecker
+~~~~~~~~~~~~~~~~~~
+.. autoclass:: NorvigSpellChecker
+    :special-members:
+    :members:
+
+The `NorvigSpellChecker` class is a fundamental component of the `pythainlp.spell` module. It implements a spell-checking algorithm based on the work of Peter Norvig. This class is designed for more advanced spell-checking and provides customizable settings for spell correction.
+
+DEFAULT_SPELL_CHECKER
+~~~~~~~~~~~~~~~~~~~~~
+.. autodata:: DEFAULT_SPELL_CHECKER
+    :annotation: = Default instance of the standard NorvigSpellChecker, using word list data from the Thai National Corpus: http://www.arts.chula.ac.th/ling/tnc/
+
+The `DEFAULT_SPELL_CHECKER` is an instance of the `NorvigSpellChecker` class with default settings. It is pre-configured to use word list data from the Thai National Corpus, making it a reliable choice for general spell-checking tasks.
+
+References
+----------
+
+.. [#norvig_spellchecker] Peter Norvig (2007). `How to Write a Spelling Corrector `_.
diff --git a/5.1/_sources/api/summarize.rst.txt b/5.1/_sources/api/summarize.rst.txt
new file mode 100644
index 0000000..42a5043
--- /dev/null
+++ b/5.1/_sources/api/summarize.rst.txt
@@ -0,0 +1,21 @@
+.. currentmodule:: pythainlp.summarize
+
+pythainlp.summarize
+===================
+The :class:`pythainlp.summarize` module provides Thai text summarization and keyword extraction.
+
+Modules
+-------
+
+.. autofunction:: summarize
+.. autofunction:: extract_keywords
+
+Keyword Extraction Engines
+--------------------------
+
+KeyBERT
++++++++
+
+.. automodule:: pythainlp.summarize.keybert
+.. 
autoclass:: pythainlp.summarize.keybert.KeyBERT + :members: diff --git a/5.1/_sources/api/tag.rst.txt b/5.1/_sources/api/tag.rst.txt new file mode 100644 index 0000000..437de7a --- /dev/null +++ b/5.1/_sources/api/tag.rst.txt @@ -0,0 +1,263 @@ +.. currentmodule:: pythainlp.tag + +pythainlp.tag +============= +The :class:`pythainlp.tag` contains functions that are used to mark linguistic and other annotation to different parts of a text including +part-of-speech (POS) tags and named entity (NE) tags. + +For POS tags, there are three sets of available tags: `Universal POS tags `_, ORCHID POS tags [#Sornlertlamvanich_2000]_, and LST20 POS tags [#Prachya_2020]_. + +The following table shows Universal POS tags as used in Universal Dependencies (UD): + +============ ========================== ============================= +Abbreviation Part-of-Speech tag Examples +============ ========================== ============================= + ADJ Adjective ใหม่, พิเศษ , ก่อน, มาก, สูง + ADP Adposition แม้, ว่า, เมื่อ, ของ, สำหรับ + ADV Adverb ก่อน, ก็, เล็กน้อย, เลย, สุด + AUX Auxiliary เป็น, ใช่, คือ, คล้าย + CCONJ Coordinating conjunction แต่, และ, หรือ + DET Determiner ที่, นี้, ซึ่ง, ทั้ง, ทุก, หลาย + INTJ Interjection อุ้ย, โอ้ย + NOUN Noun กำมือ, พวก, สนาม, กีฬา, บัญชี + NUM Numeral 5,000, 103.7, 2004, หนึ่ง, ร้อย + PART Particle มา ขึ้น ไม่ ได้ เข้า + PRON Pronoun เรา, เขา, ตัวเอง, ใคร, เธอ + PROPN Proper noun โอบามา, แคปิตอลฮิล, จีโอพี, ไมเคิล + PUNCT Punctuation (, ), ", ', : + SCONJ Subordinating conjunction หาก + VERB Verb เปิด, ให้, ใช้, เผชิญ, อ่าน +============ ========================== ============================= + +The following table shows POS tags as used in ORCHID: + +============ ================================================= ================================= +Abbreviation Part-of-Speech tag Examples +============ ================================================= ================================= + NPRP Proper noun วินโดวส์ 95, โคโรน่า, โค้ก + NCNM Cardinal number หนึ่ง, สอง, สาม, 1, 2, 10 + NONM Ordinal number ที่หนึ่ง, ที่สอง, ที่สาม, ที่1, ที่2 + NLBL Label noun 1, 2, 3, 4, ก, ข, a, b + NCMN Common noun หนังสือ, อาหาร, อาคาร, คน + NTTL Title noun ครู, พลเอก + PPRS Personal pronoun คุณ, เขา, ฉัน + PDMN Demonstrative pronoun นี่, นั้น, ที่นั่น, ที่นี่ + PNTR Interrogative pronoun ใคร, อะไร, อย่างไร + PREL Relative pronoun ที่, ซึ่ง, อัน, ผู้ + VACT Active verb ทำงาน, ร้องเพลง, กิน + VSTA Stative verb เห็น, รู้, คือ + VATT Attributive verb อ้วน, ดี, สวย + XVBM Pre-verb auxiliary, before negator "ไม่" เกิด, เกือบ, กำลัง + XVAM Pre-verb auxiliary, after negator "ไม่" ค่อย, น่า, ได้ + XVMM Pre-verb, before or after negator "ไม่" ควร, เคย, ต้อง + XVBB Pre-verb auxiliary, in imperative mood กรุณา, จง, เชิญ, อย่า, ห้าม + XVAE Post-verb auxiliary ไป, มา, ขึ้น + DDAN | Definite determiner, after noun without ยี่, นั่น, โน่น, ทั้งหมด + | classifier in between + DDAC | Definite determiner, allowing classifier นี้, นั้น, โน้น, นู้น + | in between + DDBQ | Definite determiner, between noun and ทั้ง, อีก, เพียง + | classifier or preceding quantitative expression + DDAQ | Definite determiner, พอดี, ถ้วน + | following quantitative expression + DIAC | Indefinite determiner, following noun; allowing ไหน, อื่น, ต่างๆ + | classifier in between + DIBQ | Indefinite determiner, between noun and บาง, ประมาณ, เกือบ + | classifier or preceding quantitative expression + DIAQ | Indefinite determiner, กว่า, เศษ + | following quantitative expression + DCNM Determiner, cardinal number expression **หนึ่ง**\ 
คน, เสือ, **2** ตัว + DONM Determiner, ordinal number expression ที่หนึ่ง, ที่สอง, ที่สุดท้สย + ADVN Adverb with normal form เก่ง, เร็ว, ช้า, สม่ำเสมอ + ADVI Adverb with iterative form เร็วๆ, เสทอๆ, ช้าๆ + ADVP Adverb with prefixed form โดยเร็ว + ADVS Sentential adverb โดยปกติ, ธรรมดา + CNIT Unit classifier ตัว, คน, เล่ม + CLTV Collective classifier | คู่, กลุ่ม, ฝูง, เชิง, ทาง, + | ด้าน, แบบ, รุ่น + CMTR Measurement classifier กิโลกรัม, แก้ว, ชั่วโมง + CFQC Frequency classifier ครั้ง, เที่ยว + CVBL Verbal classifier ม้วน, มัด + JCRG Coordinating conjunction และ, หรือ, แต่ + JCMP Comparative conjunction กว่า, เหมือนกับ, เท่ากับ + JSBR Subordinating conjunction เพราะว่า, เนื่องจาก ที่, แม้ว่า, ถ้า + RPRE Preposition จาก, ละ, ของ, ใต้, บน + INT Interjection โอ้บ, โอ้, เออ, เอ๋, อ๋อ + FIXN Nominal prefix **การ**\ ทำงาน, **ความ**\ สนุนสนาน + FIXV Adverbial prefix **อย่าง**\ เร็ว + EAFF Ending for affirmative sentence จ๊ะ, จ้ะ, ค่ะ, ครับ, นะ, น่า, เถอะ + EITT Ending for interrogative sentence หรือ, เหรอ, ไหม, มั้ย + NEG Negator ไม่, มิได้, ไม่ได้, มิ + PUNC Punctuation (, ), “, ,, ; +============ ================================================= ================================= + +ORCHID corpus uses a different set of POS tags. Thus, we make UD POS tags version for ORCHID corpus. + +The following table shows the mapping of POS tags from ORCHID to UD: + +=============== ======================== +ORCHID POS tags Corresponding UD POS tag +=============== ======================== +NOUN NOUN +NCMN NOUN +NTTL NOUN +CNIT NOUN +CLTV NOUN +CMTR NOUN +CFQC NOUN +CVBL NOUN +VACT VERB +VSTA VERB +PROPN PROPN +NPRP PROPN +ADJ ADJ +NONM ADJ +VATT ADJ +DONM ADJ +ADV ADV +ADVN ADV +ADVI ADV +ADVP ADV +ADVS ADV +INT INTJ +PRON PRON +PPRS PRON +PDMN PRON +PNTR PRON +DET DET +DDAN DET +DDAC DET +DDBQ DET +DDAQ DET +DIAC DET +DIBQ DET +DIAQ DET +NUM NUM +NCNM NUM +NLBL NUM +DCNM NUM +AUX AUX +XVBM AUX +XVAM AUX +XVMM AUX +XVBB AUX +XVAE AUX +ADP ADP +RPRE ADP +CCONJ CCONJ +JCRG CCONJ +SCONJ SCONJ +PREL SCONJ +JSBR SCONJ +JCMP SCONJ +PART PART +FIXN PART +FIXV PART +EAFF PART +EITT PART +NEG PART +PUNCT PUNCT +PUNC PUNCT +=============== ======================= + +Details about LST20 POS tags are available in [#Prachya_2020]_. + +The following table shows the mapping of POS tags from LST20 to UD: + ++----------------+--------------------------+ +| LST20 POS tags | Corresponding UD POS tag | ++================+==========================+ +| AJ | ADJ | ++----------------+--------------------------+ +| AV | ADV | ++----------------+--------------------------+ +| AX | AUX | ++----------------+--------------------------+ +| CC | CCONJ | ++----------------+--------------------------+ +| CL | NOUN | ++----------------+--------------------------+ +| FX | NOUN | ++----------------+--------------------------+ +| IJ | INTJ | ++----------------+--------------------------+ +| NN | NOUN | ++----------------+--------------------------+ +| NU | NUM | ++----------------+--------------------------+ +| PA | PART | ++----------------+--------------------------+ +| PR | PROPN | ++----------------+--------------------------+ +| PS | ADP | ++----------------+--------------------------+ +| PU | PUNCT | ++----------------+--------------------------+ +| VV | VERB | ++----------------+--------------------------+ +| XX | X | ++----------------+--------------------------+ + +For the NE, we use `Inside-outside-beginning (IOB) `_ format to tag NE for each word. + +*B-* prefix indicates the beginning token of the chunk. 
*I-* prefix indicates the intermediate token within the chunk. *O* indicates that the token does not belong to any NE chunk. + +For instance, given a sentence "บารัค โอบามาเป็นประธานธิปดี", it would tag the tokens "บารัค", "โอบามา", "เป็น", "ประธานาธิปดี" with "B-PERSON", "I-PERSON", "O", and "O" respectively. + +The following table shows named entity (NE) tags as used in PyThaiNLP: + +============================ ================================= +Named Entity tag Examples +============================ ================================= + DATE 2/21/2004, 16 ก.พ., จันทร์ + TIME 16.30 น., 5 วัน, 1-3 ปี + EMAIL info@nrpsc.ac.th + LEN 30 กิโลเมตร, 5 กม. + LOCATION ไทย, จ.ปราจีนบุรี, กำแพงเพชร + ORGANIZATION กรมวิทยาศาสตร์การแพทย์, อย. + PERSON น.พ.จรัล, นางประนอม ทองจันทร์ + PHONE 1200, 0 2670 8888 + URL http://www.bangkokhealth.com/ + ZIP 10400, 11130 + Money 2.7 ล้านบาท, 2,000 บาท + LAW พ.ร.บ.โรคระบาด พ.ศ.2499, รัฐธรรมนูญ +============================ ================================= + +Modules +------- + +.. autofunction:: pos_tag +.. autofunction:: pos_tag_sents +.. autofunction:: tag_provinces +.. autofunction:: chunk_parse +.. autoclass:: NER + :members: +.. autoclass:: NNER + :members: +.. autoclass:: pythainlp.tag.thainer.ThaiNameTagger + :members: get_ner +.. autofunction:: pythainlp.tag.tltk.get_ner + +Tagger Engines +-------------- + +perceptron +++++++++++ + +Perceptron tagger is a part-of-speech tagging using the averaged, structured perceptron algorithm. + +unigram ++++++++ + +Unigram tagger doesn't take the ordering of words in the list into account. + + +References +---------- + +.. [#Sornlertlamvanich_2000] Virach Sornlertlamvanich, Naoto Takahashi and Hitoshi Isahara. (2000). + Building a Thai Part-Of-Speech Tagged Corpus (ORCHID). + The Journal of the Acoustical Society of Japan (E), Vol.20, No.3, pp 189-198, May 1999. +.. [#Prachya_2020] Prachya Boonkwan and Vorapon Luantangsrisuk and Sitthaa Phaholphinyo and Kanyanat Kriengket and Dhanon Leenoi and Charun Phrombut and Monthika Boriboon and Krit Kosawat and Thepchai Supnithi. (2020). + The Annotation Guideline of LST20 Corpus. + arXiv:2008.05055 diff --git a/5.1/_sources/api/tokenize.rst.txt b/5.1/_sources/api/tokenize.rst.txt new file mode 100644 index 0000000..5fe02fd --- /dev/null +++ b/5.1/_sources/api/tokenize.rst.txt @@ -0,0 +1,173 @@ +.. currentmodule:: pythainlp.tokenize +.. _tokenize-doc: + +pythainlp.tokenize +================== +The :mod:`pythainlp.tokenize` module contains a comprehensive set of functions and classes for tokenizing Thai text into various units, such as sentences, words, subwords, and more. This module is a fundamental component of the PyThaiNLP library, providing tools for natural language processing in the Thai language. + +Modules +------- + + +.. autofunction:: sent_tokenize + :noindex: + + Splits Thai text into sentences. This function identifies sentence boundaries, which is essential for text segmentation and analysis. + +.. autofunction:: paragraph_tokenize + :noindex: + + Segments text into paragraphs, which can be valuable for document-level analysis or summarization. + +.. autofunction:: subword_tokenize + :noindex: + + Tokenizes text into subwords, which can be helpful for various NLP tasks, including subword embeddings. + +.. autofunction:: syllable_tokenize + :noindex: + + Divides text into syllables, allowing you to work with individual Thai language phonetic units. + +.. autofunction:: word_tokenize + :noindex: + + Splits text into words. 
This function is a fundamental tool for Thai language text analysis. + +.. autofunction:: word_detokenize + :noindex: + + Reverses the tokenization process, reconstructing text from tokenized units. Useful for text generation tasks. + +.. autoclass:: Tokenizer + :members: + + The `Tokenizer` class is a versatile tool for customizing tokenization processes and managing tokenization models. It provides various methods and attributes to fine-tune tokenization according to your specific needs. + +.. autoclass:: display_cell_tokenize + +Tokenization Engines +-------------------- + +This module offers multiple tokenization engines designed for different levels of text analysis. + +Sentence level +-------------- + +**crfcut** + +.. automodule:: pythainlp.tokenize.crfcut + :members: + + A tokenizer that operates at the sentence level using Conditional Random Fields (CRF). It is suitable for segmenting text into sentences accurately. + +**thaisumcut** + +.. automodule:: pythainlp.tokenize.thaisumcut + :members: + + A sentence tokenizer based on a maximum entropy model. It's a great choice for sentence boundary detection in Thai text. + +Word level +---------- + +**attacut** + +.. automodule:: pythainlp.tokenize.attacut + :members: + + A tokenizer designed for word-level segmentation. It provides accurate word boundary detection in Thai text. + +**deepcut** + +.. automodule:: pythainlp.tokenize.deepcut + :members: + + Utilizes deep learning techniques for word segmentation, achieving high accuracy and performance. + +**multi_cut** + +.. automodule:: pythainlp.tokenize.multi_cut + :members: + + An ensemble tokenizer that combines multiple tokenization strategies for improved word segmentation. + +**nlpo3** + +.. automodule:: pythainlp.tokenize.nlpo3 + :members: + + A word tokenizer based on the NLPO3 model. It offers advanced word boundary detection and is suitable for various NLP tasks. + +**longest** + +.. automodule:: pythainlp.tokenize.longest + :members: + + A tokenizer that identifies word boundaries by selecting the longest possible words in a text. + +**pyicu** + +.. automodule:: pythainlp.tokenize.pyicu + :members: + + An ICU-based word tokenizer offering robust support for Thai text segmentation. + +**nercut** + +.. automodule:: pythainlp.tokenize.nercut + :members: + + A tokenizer optimized for Named Entity Recognition (NER) tasks, ensuring accurate tokenization for entity recognition. + +**sefr_cut** + +.. automodule:: pythainlp.tokenize.sefr_cut + :members: + + An advanced word tokenizer for segmenting Thai text, with a focus on precision. + +**oskut** + +.. automodule:: pythainlp.tokenize.oskut + :members: + + A tokenizer that uses a pre-trained model for word segmentation. It's a reliable choice for general-purpose text analysis. + +**newmm (Default)** + +.. automodule:: pythainlp.tokenize.newmm + :members: + + The default word tokenization engine that provides a balance between accuracy and efficiency for most use cases. + +Subword level +------------- + +**tcc** + +.. automodule:: pythainlp.tokenize.tcc + :members: + + Tokenizes text into Thai Character Clusters (TCCs), a subword level representation. + +**tcc+** + +.. automodule:: pythainlp.tokenize.tcc_p + :members: + + A subword tokenizer that includes additional rules for more precise subword segmentation. + +**etcc** + +.. automodule:: pythainlp.tokenize.etcc + :members: + + Enhanced Thai Character Clusters (eTCC) tokenizer for subword-level analysis. + +**han_solo** + +.. 
automodule:: pythainlp.tokenize.han_solo + :members: + + A subword tokenizer specialized for Han characters and mixed scripts, suitable for various text processing scenarios. diff --git a/5.1/_sources/api/tools.rst.txt b/5.1/_sources/api/tools.rst.txt new file mode 100644 index 0000000..8b31ecc --- /dev/null +++ b/5.1/_sources/api/tools.rst.txt @@ -0,0 +1,30 @@ +.. currentmodule:: pythainlp.tools + +pythainlp.tools +=============== +The :mod:`pythainlp.tools` module encompasses a collection of miscellaneous functions primarily designed for internal use within the PyThaiNLP library. While these functions may not be directly exposed for external use, understanding their purpose can offer insights into the inner workings of PyThaiNLP. + +Modules +------- + +.. autofunction:: get_full_data_path + :noindex: + + Retrieves the full path to the PyThaiNLP data directory. This function is essential for internal data management, enabling PyThaiNLP to locate resources efficiently. + +.. autofunction:: get_pythainlp_data_path + :noindex: + + Obtains the path to the PyThaiNLP data directory. This function is useful for accessing the library's data resources for internal processes. + +.. autofunction:: get_pythainlp_path + :noindex: + + Returns the path to the PyThaiNLP library directory. This function is vital for PyThaiNLP's internal operations and library management. + +.. autofunction:: pythainlp.tools.misspell.misspell + :noindex: + + This module appears to be related to handling misspellings within PyThaiNLP. While not explicitly documented here, it likely provides functionality for identifying and correcting misspelled words, which can be crucial for text preprocessing and language processing tasks. + +The `pythainlp.tools` module contains these functions, which are mainly intended for PyThaiNLP's internal workings. While they may not be directly utilized by external users, they play a pivotal role in ensuring the smooth operation of the library. Understanding the purpose of these functions can be valuable for contributors and developers working on PyThaiNLP, as it sheds light on the internal mechanisms and data management within the library. diff --git a/5.1/_sources/api/translate.rst.txt b/5.1/_sources/api/translate.rst.txt new file mode 100644 index 0000000..bd5ec4a --- /dev/null +++ b/5.1/_sources/api/translate.rst.txt @@ -0,0 +1,45 @@ +.. currentmodule:: pythainlp.translate + +pythainlp.translate +=================== +The :mod:`pythainlp.translate` module is dedicated to machine translation capabilities for the PyThaiNLP library. It provides tools for translating text between different languages, making it a valuable resource for natural language processing tasks. + +Modules +------- + +.. autoclass:: Translate + :members: + + The `Translate` class is the central component of the module, offering a unified interface for various translation tasks. It acts as a coordinator, directing translation requests to specific language pairs and models. + +.. autofunction:: pythainlp.translate.en_th.download_model_all + :noindex: + + This function facilitates the download of all available English to Thai translation models. It ensures that the required models are accessible for translation tasks, enhancing the usability of the module. + +.. autoclass:: pythainlp.translate.en_th.EnThTranslator + :members: + + The `EnThTranslator` class specializes in translating text from English to Thai. 
It offers a range of methods for translating sentences and text, enabling accurate and meaningful translations between these languages. + +.. autoclass:: pythainlp.translate.en_th.ThEnTranslator + :members: + + Conversely, the `ThEnTranslator` class focuses on translating text from Thai to English. It provides functionality for translating Thai text into English, contributing to effective language understanding and communication. + +.. autoclass:: pythainlp.translate.zh_th.ThZhTranslator + :members: + + The `ThZhTranslator` class specializes in translating text from Thai to Chinese (Simplified). This class is valuable for bridging language gaps between these two languages, promoting cross-cultural communication. + +.. autoclass:: pythainlp.translate.zh_th.ZhThTranslator + :members: + + The `ZhThTranslator` class is designed for translating text from Chinese (Simplified) to Thai. It assists in making content accessible to Thai-speaking audiences by converting Chinese text into Thai. + +.. autoclass:: pythainlp.translate.th_fr.ThFrTranslator + :members: + + Lastly, the `ThFrTranslator` class specializes in translating text from Thai to French. It serves as a tool for expanding language accessibility and promoting content sharing in French-speaking communities. + +.. The `pythainlp.translate` module extends the language processing capabilities of PyThaiNLP, offering machine translation functionality for various language pairs. Whether you need to translate text between English and Thai, Thai and Chinese, or Thai and French, this module provides the necessary tools and classes to facilitate seamless language conversion. The `Translate` class acts as the central coordinator, while language-specific classes ensure accurate and meaningful translations for diverse linguistic scenarios. diff --git a/5.1/_sources/api/transliterate.rst.txt b/5.1/_sources/api/transliterate.rst.txt new file mode 100644 index 0000000..6222e9c --- /dev/null +++ b/5.1/_sources/api/transliterate.rst.txt @@ -0,0 +1,67 @@ +.. currentmodule:: pythainlp.transliterate + +pythainlp.transliterate +======================= +The :mod:`pythainlp.transliterate` module is dedicated to the transliteration of Thai text into romanized form, effectively spelling it out with the English alphabet. This functionality is invaluable for making Thai text more accessible to non-Thai speakers and for various language processing tasks. + +Modules +------- + +.. autofunction:: romanize + :noindex: + + The `romanize` function allows you to transliterate Thai text, converting it into a phonetic representation using the English alphabet. It's a fundamental tool for rendering Thai words and phrases in a more familiar format. + +.. autofunction:: transliterate + :noindex: + + The `transliterate` function serves as a versatile transliteration tool, offering a range of transliteration engines to choose from. It provides flexibility and customization for your transliteration needs. + +.. autofunction:: pronunciate + :noindex: + + This function provides assistance in generating phonetic representations of Thai words, which is particularly useful for language learning and pronunciation practice. + +.. autofunction:: puan + :noindex: + + The `puan` function offers a unique transliteration feature known as "Puan." It provides a specialized transliteration method for Thai text and is an additional option for rendering Thai text into English characters. + +.. 
autoclass:: pythainlp.transliterate.wunsen.WunsenTransliterate + :members: + + The `WunsenTransliterate` class represents a transliteration engine known as "Wunsen." It offers specific transliteration methods for rendering Thai text into a phonetic English format. + +Transliteration Engines +----------------------- + +**thai2rom** + +.. autofunction:: pythainlp.transliterate.thai2rom.romanize + + The `thai2rom` engine specializes in transliterating Thai text into romanized form. It's particularly useful for rendering Thai words accurately in an English phonetic format. + +**royin** + +.. autofunction:: pythainlp.transliterate.royin.romanize + + The `royin` engine focuses on transliterating Thai text into English characters. It provides an alternative approach to transliteration, ensuring accurate representation of Thai words. + +**Transliterate Engines** + +This section includes multiple transliteration engines designed to suit various use cases. They offer unique methods for transliterating Thai text into romanized form: + +- **icu**: Utilizes the ICU transliteration system for phonetic conversion. +- **ipa**: Provides International Phonetic Alphabet (IPA) representation of Thai text. +- **thaig2p**: (default) Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation. +- **thaig2p_v2**: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation. This model is from https://huggingface.co/pythainlp/thaig2p-v2.0 +- **tltk**: Utilizes the TLTK transliteration system for a specific approach to transliteration. +- **iso_11940**: Focuses on the ISO 11940 transliteration standard. + +References +---------- + +.. [#rtgs_transcription] Nitaya Kanchanawan. (2006). `Romanization, Transliteration, and Transcription for the Globalization of the Thai Language. `_ + The Journal of the Royal Institute of Thailand. + +The `pythainlp.transliterate` module offers a comprehensive set of tools and engines for transliterating Thai text into Romanized form. Whether you need a simple transliteration, specific engines for accurate representation, or phonetic rendering, this module provides a wide range of options. Additionally, the module references a publication that highlights the significance of Romanization, Transliteration, and Transcription in making the Thai language accessible to a global audience. diff --git a/5.1/_sources/api/ulmfit.rst.txt b/5.1/_sources/api/ulmfit.rst.txt new file mode 100644 index 0000000..d9b1856 --- /dev/null +++ b/5.1/_sources/api/ulmfit.rst.txt @@ -0,0 +1,90 @@ +.. currentmodule:: pythainlp.ulmfit + +pythainlp.ulmfit +==================================== +Welcome to the `pythainlp.ulmfit` module, where you'll find powerful tools for Universal Language Model Fine-tuning for Text Classification (ULMFiT). ULMFiT is a cutting-edge technique for training deep learning models on large text corpora and then fine-tuning them for specific text classification tasks. + +Modules +------- + +.. autoclass:: ThaiTokenizer + :members: + + The `ThaiTokenizer` class is a critical component of ULMFiT, designed for tokenizing Thai text effectively. Tokenization is the process of breaking down text into individual tokens, and this class allows you to do so with precision and accuracy. + +.. autofunction:: document_vector + :noindex: + + The `document_vector` function is a powerful tool that computes document vectors for text data. 
This functionality is often used in text classification tasks where you need to represent documents as numerical vectors for machine learning models. + +.. autofunction:: fix_html + :noindex: + + The `fix_html` function is a text preprocessing utility that handles HTML-specific characters, making text cleaner and more suitable for text classification. + +.. autofunction:: lowercase_all + :noindex: + + The `lowercase_all` function is a text processing utility that converts all text to lowercase. This is useful for ensuring uniformity in text data and reducing the complexity of text classification tasks. + +.. autofunction:: merge_wgts + :noindex: + + The `merge_wgts` function is a tool for merging weight arrays, which can be crucial for managing and fine-tuning deep learning models in ULMFiT. + +.. autofunction:: process_thai + :noindex: + + The `process_thai` function is designed for preprocessing Thai text data, a vital step in preparing text for ULMFiT-based text classification. + +.. autofunction:: rm_brackets + :noindex: + + The `rm_brackets` function removes brackets from text, making it more suitable for text classification tasks that don't require bracket information. + +.. autofunction:: rm_useless_newlines + :noindex: + + The `rm_useless_newlines` function eliminates unnecessary newlines in text data, ensuring that text is more compact and easier to work with in ULMFiT-based text classification. + +.. autofunction:: rm_useless_spaces + :noindex: + + The `rm_useless_spaces` function removes extraneous spaces from text, making it cleaner and more efficient for ULMFiT-based text classification. + +.. autofunction:: remove_space + :noindex: + + The `remove_space` function is a utility for removing space characters from text data, streamlining the text for classification purposes. + +.. autofunction:: replace_rep_after + :noindex: + + The `replace_rep_after` function is a text preprocessing tool for replacing repeated characters in text with a single occurrence. This step helps in standardizing text data for text classification. + +.. autofunction:: replace_rep_nonum + :noindex: + + The `replace_rep_nonum` function is similar to `replace_rep_after`, but it focuses on replacing repeated characters without considering numbers. + +.. autofunction:: replace_wrep_post + :noindex: + + The `replace_wrep_post` function is used for replacing repeated words in text with a single occurrence. This function helps in reducing redundancy in text data, making it more efficient for text classification tasks. + +.. autofunction:: replace_wrep_post_nonum + :noindex: + + Similar to `replace_wrep_post`, the `replace_wrep_post_nonum` function removes repeated words without considering numbers in the text. + +.. autofunction:: spec_add_spaces + :noindex: + + The `spec_add_spaces` function is a text processing tool for adding spaces between special characters in text data. This step helps in standardizing text for ULMFiT-based text classification. + +.. autofunction:: ungroup_emoji + :noindex: + + The `ungroup_emoji` function is designed for ungrouping emojis in text data, which can be crucial for emoji recognition and classification tasks. + +.. The `pythainlp.ulmfit` module provides a comprehensive set of tools for ULMFiT-based text classification. Whether you need to preprocess Thai text, tokenize it, compute document vectors, or perform various text cleaning tasks, this module has the utilities you need. 
ULMFiT is a state-of-the-art technique in NLP, and these tools empower you to use it effectively for text classification. diff --git a/5.1/_sources/api/util.rst.txt b/5.1/_sources/api/util.rst.txt new file mode 100644 index 0000000..9a15547 --- /dev/null +++ b/5.1/_sources/api/util.rst.txt @@ -0,0 +1,307 @@ +.. currentmodule:: pythainlp.util + +pythainlp.util +============== +The :mod:`pythainlp.util` module serves as a treasure trove of utility functions designed to aid text conversion, formatting, and various language processing tasks in the context of Thai language. + +Modules +------- + +.. autofunction:: abbreviation_to_full_text + :noindex: + + The `abbreviation_to_full_text` function is a text processing tool for converting common Thai abbreviations into their full, expanded forms. It's invaluable for improving text readability and clarity. + +.. autofunction:: arabic_digit_to_thai_digit + :noindex: + + The `arabic_digit_to_thai_digit` function allows you to transform Arabic numerals into their Thai numeral equivalents. This utility is especially useful when working with Thai numbers in text data. + +.. autofunction:: bahttext + :noindex: + + The `bahttext` function specializes in converting numerical values into Thai Baht text, an essential feature for rendering financial data or monetary amounts in a user-friendly Thai format. + +.. autofunction:: convert_years + :noindex: + + The `convert_years` function is designed to facilitate the conversion of Western calendar years into Thai Buddhist Era (BE) years. This is significant for presenting dates and years in a Thai context. + +.. autofunction:: collate + :noindex: + + The `collate` function is a versatile tool for sorting Thai text in a locale-specific manner. It ensures that text data is sorted correctly, taking into account the Thai language's unique characteristics. + +.. autofunction:: count_thai_chars + :noindex: + + The `count_thai_chars` function is a character counting tool specifically tailored for Thai text. It helps in quantifying Thai characters, which can be useful for various text processing tasks. + +.. autofunction:: countthai + :noindex: + + The `countthai` function is a text processing utility for counting the occurrences of Thai characters in text data. This is useful for understanding the prevalence of Thai language content. + +.. autofunction:: dict_trie + :noindex: + + The `dict_trie` function implements a Trie data structure for efficient dictionary operations. It's a valuable resource for dictionary management and fast word lookup. + +.. autofunction:: digit_to_text + :noindex: + + The `digit_to_text` function is a numeral conversion tool that translates Arabic numerals into their Thai textual representations. This is vital for rendering numbers in Thai text naturally. + +.. autofunction:: display_thai_char + :noindex: + + The `display_thai_char` function is designed to present Thai characters with diacritics and tonal marks accurately. This is essential for displaying Thai text with correct pronunciation cues. + +.. autofunction:: emoji_to_thai + :noindex: + + The `emoji_to_thai` function focuses on converting emojis into their Thai language equivalents. This is a unique feature for enhancing text communication with Thai-language emojis. + +.. autofunction:: eng_to_thai + :noindex: + + The `eng_to_thai` function serves as a text conversion tool for translating English text into its Thai transliterated form. It is beneficial for rendering English words and phrases in a Thai context. + +.. 
autofunction:: find_keyword + :noindex: + + The `find_keyword` function is a powerful utility for identifying keywords and key phrases in text data. It is a fundamental component for text analysis and information extraction tasks. + +.. autofunction:: ipa_to_rtgs + :noindex: + + The `ipa_to_rtgs` function focuses on converting International Phonetic Alphabet (IPA) transcriptions into Royal Thai General System of Transcription (RTGS) format. This is valuable for phonetic analysis and pronunciation guides. + +.. autofunction:: isthai + :noindex: + + The `isthai` function is a straightforward language detection utility that determines if text contains Thai language content. This function is essential for language-specific text processing. + +.. autofunction:: isthaichar + :noindex: + + The `isthaichar` function is designed to check if a character belongs to the Thai script. It helps in character-level language identification and text processing. + +.. autofunction:: maiyamok + :noindex: + + The `maiyamok` function is a text processing tool that assists in identifying and processing Thai character characters with a 'mai yamok' tone mark. + +.. autofunction:: nectec_to_ipa + :noindex: + + The `nectec_to_ipa` function focuses on converting text from the NECTEC phonetic transcription system to the International Phonetic Alphabet (IPA). This conversion is vital for linguistic analysis and phonetic representation. + +.. autofunction:: normalize + :noindex: + + The `normalize` function is a text processing utility that standardizes text by removing diacritics, tonal marks, and other modifications. It is valuable for text normalization and linguistic analysis. + +.. autofunction:: now_reign_year + :noindex: + + The `now_reign_year` function computes the current Thai Buddhist Era (BE) year and provides it in a human-readable format. This function is essential for displaying the current year in a Thai context. + +.. autofunction:: num_to_thaiword + :noindex: + + The `num_to_thaiword` function is a numeral conversion tool for translating Arabic numerals into Thai word form. It is crucial for rendering numbers in a natural Thai textual format. + +.. autofunction:: rank + :noindex: + + The `rank` function is designed for ranking and ordering a list of items. It is a general-purpose utility for ranking items based on various criteria. + +.. autofunction:: reign_year_to_ad + :noindex: + + The `reign_year_to_ad` function facilitates the conversion of Thai Buddhist Era (BE) years into Western calendar years. This is useful for displaying historical dates in a globally recognized format. + +.. autofunction:: remove_dangling + :noindex: + + The `remove_dangling` function is a text processing tool for removing dangling characters or diacritics from text. It is useful for text cleaning and normalization. + +.. autofunction:: remove_dup_spaces + :noindex: + + The `remove_dup_spaces` function focuses on removing duplicate space characters from text data, making it more consistent and readable. + +.. autofunction:: remove_repeat_vowels + :noindex: + + The `remove_repeat_vowels` function is designed to eliminate repeated vowel characters in text, improving text readability and consistency. + +.. autofunction:: remove_tone_ipa + :noindex: + + The `remove_tone_ipa` function serves as a phonetic conversion tool for removing tone marks from IPA transcriptions. This is crucial for phonetic analysis and linguistic research. + +.. 
autofunction:: remove_tonemark + :noindex: + + The `remove_tonemark` function is a utility for removing tonal marks and diacritics from text data, making it suitable for various text processing tasks. + +.. autofunction:: remove_zw + :noindex: + + The `remove_zw` function is designed to remove zero-width characters from text data, ensuring that text is free from invisible or unwanted characters. + +.. autofunction:: reorder_vowels + :noindex: + + The `reorder_vowels` function is a text processing utility for reordering vowel characters in Thai text. It is essential for phonetic analysis and pronunciation guides. + +.. autofunction:: rhyme + :noindex: + + The `rhyme` function is a utility for finding words that rhyme with a given Thai word. + +.. autofunction:: sound_syllable + :noindex: + + The `sound_syllable` function classifies a Thai syllable by its sound type (for example, live or dead syllable). This is valuable for phonetic and linguistic analysis. + +.. autofunction:: syllable_length + :noindex: + + The `syllable_length` function is a text analysis tool for determining the length of syllables in Thai text. It is significant for linguistic analysis and language research. + +.. autofunction:: syllable_open_close_detector + :noindex: + + The `syllable_open_close_detector` function is designed to detect whether a Thai syllable is open or closed. This information is vital for phonetic analysis and linguistic research. + +.. autofunction:: text_to_arabic_digit + :noindex: + + The `text_to_arabic_digit` function is a numeral conversion tool that translates Thai text numerals into Arabic numeral form. It is useful for numerical data extraction and processing. + +.. autofunction:: text_to_num + :noindex: + + The `text_to_num` function focuses on extracting numerical values from text data. This is essential for converting textual numbers into numerical form for computation. + +.. autofunction:: text_to_thai_digit + :noindex: + + The `text_to_thai_digit` function serves as a numeral conversion tool for translating Arabic numerals into Thai numeral form. This is important for rendering numbers in Thai text naturally. + +.. autofunction:: thai_digit_to_arabic_digit + :noindex: + + The `thai_digit_to_arabic_digit` function allows you to transform Thai numeral text into Arabic numeral format. This is valuable for numerical data extraction and computation tasks. + +.. autofunction:: thai_strftime + :noindex: + + The `thai_strftime` function is a date formatting tool tailored for Thai culture. It is essential for displaying dates and times in a format that adheres to Thai conventions. + +.. autofunction:: thai_strptime + :noindex: + + The `thai_strptime` function focuses on parsing dates and times in a Thai-specific format, making it easier to work with date and time data in a Thai context. + +.. autofunction:: thai_to_eng + :noindex: + + The `thai_to_eng` function is a text conversion tool for translating Thai text into its English transliterated form. This is beneficial for rendering Thai words and phrases in an English context. + +.. autofunction:: to_idna + :noindex: + + The `to_idna` function is a text conversion tool that encodes Thai text into its Internationalized Domain Name (IDN) form, for use with Thai domain names. + +.. autofunction:: thai_word_tone_detector + :noindex: + + The `thai_word_tone_detector` function specializes in detecting the tone of syllables in Thai words. It is essential for phonetic analysis and pronunciation guides. + +.. 
autofunction:: thaiword_to_date + :noindex: + + The `thaiword_to_date` function facilitates the conversion of Thai word representations of dates into standardized date formats. This is important for date data extraction and processing. + +.. autofunction:: thaiword_to_num + :noindex: + + The `thaiword_to_num` function is a numeral conversion tool for translating Thai word numerals into numerical form. This is essential for numerical data extraction and computation. + +.. autofunction:: thaiword_to_time + :noindex: + + The `thaiword_to_time` function is designed for converting Thai word representations of time into standardized time formats. It is crucial for time data extraction and processing. + +.. autofunction:: time_to_thaiword + :noindex: + + The `time_to_thaiword` function focuses on converting time values into Thai word representations. This is valuable for rendering time in a natural Thai textual format. + +.. autofunction:: tis620_to_utf8 + :noindex: + + The `tis620_to_utf8` function serves as a character encoding conversion tool for converting TIS-620 encoded text into UTF-8 format. This is significant for character encoding compatibility. + +.. autofunction:: tone_detector + :noindex: + + The `tone_detector` function is a text processing tool for detecting tone marks and diacritics in Thai text. It is essential for phonetic analysis and pronunciation guides. + +.. autofunction:: words_to_num + :noindex: + + The `words_to_num` function is a numeral conversion utility that translates Thai word numerals into numerical form. It is important for numerical data extraction and computation. + +.. autofunction:: spelling + :noindex: + + The `spelling` function is a text processing tool for spelling out a Thai word. + +.. autofunction:: thai_consonant_to_spelling + +.. autofunction:: tone_to_spelling + +.. autofunction:: pythainlp.util.spell_words.spell_syllable + :noindex: + + The `pythainlp.util.spell_words.spell_syllable` function focuses on spelling out syllables in Thai text, an important feature for phonetic analysis and linguistic research. + +.. autofunction:: pythainlp.util.spell_words.spell_word + :noindex: + + The `pythainlp.util.spell_words.spell_word` function is designed for spelling out individual words in Thai text, facilitating phonetic analysis and pronunciation guides. + +.. autofunction:: to_lunar_date + :noindex: + + The `to_lunar_date` function converts a solar (Gregorian) date into its Thai lunar calendar date. + +.. autofunction:: th_zodiac + :noindex: + + The `th_zodiac` function converts a Gregorian year to its corresponding Thai Zodiac name. + +.. autoclass:: Trie + :members: + + The `Trie` class is a data structure for efficient dictionary operations. It's a valuable resource for managing and searching word lists and dictionaries in a structured and efficient manner. + +.. autofunction:: longest_common_subsequence + :noindex: + + The `longest_common_subsequence` function finds the longest common subsequence between two strings. + +.. autofunction:: pythainlp.util.morse.morse_encode + :noindex: + + The `pythainlp.util.morse.morse_encode` function converts text to Morse code. + +.. autofunction:: pythainlp.util.morse.morse_decode + :noindex: + + The `pythainlp.util.morse.morse_decode` function converts Morse code back to text. diff --git a/5.1/_sources/api/wangchanberta.rst.txt b/5.1/_sources/api/wangchanberta.rst.txt new file mode 100644 index 0000000..7162dbf --- /dev/null +++ b/5.1/_sources/api/wangchanberta.rst.txt @@ -0,0 +1,50 @@ +.. 
currentmodule:: pythainlp.wangchanberta + +pythainlp.wangchanberta +======================= +The `pythainlp.wangchanberta` module is built upon the WangchanBERTa base model, specifically the `wangchanberta-base-att-spm-uncased` model, as detailed in the paper by Lowphansirikul et al. [Lowphansirikul_2021]_. + +This base model is utilized for various natural language processing tasks in the Thai language, including named entity recognition, part-of-speech tagging, and subword tokenization. + +If you intend to fine-tune the model or explore its capabilities further, please refer to the `thai2transformers repository <https://github.com/vistec-AI/thai2transformers>`_. + +**Speed Benchmark** + +============================= ======================== ============== +Function Named Entity Recognition Part of Speech +============================= ======================== ============== +PyThaiNLP basic function 89.7 ms 312 ms +pythainlp.wangchanberta (CPU) 9.64 s 9.65 s +pythainlp.wangchanberta (GPU) 8.02 s 8 s +============================= ======================== ============== + +For a comprehensive performance benchmark, the following notebooks are available: + +- `PyThaiNLP basic function and pythainlp.wangchanberta CPU at Google + Colab`_ +- `pythainlp.wangchanberta GPU`_ + +.. _PyThaiNLP basic function and pythainlp.wangchanberta CPU at Google Colab: https://colab.research.google.com/drive/1ymTVB1UESXAyZlSpjknCb72xpdcZ86Db?usp=sharing +.. _pythainlp.wangchanberta GPU: https://colab.research.google.com/drive/1AtkFT1HMGL2GO7O2tM_hi_7mExKwmhMw?usp=sharing + +Modules +------- +.. autoclass:: NamedEntityRecognition + :members: + + The `NamedEntityRecognition` class is a fundamental component for identifying named entities in Thai text. It allows you to extract entities such as names, locations, and organizations from text data. + +.. autoclass:: ThaiNameTagger + :members: + + The `ThaiNameTagger` class is designed for tagging Thai names within text. This is essential for tasks such as entity recognition, information extraction, and text classification. + +.. autofunction:: segment + :noindex: + + The `segment` function is a subword tokenization tool that breaks down text into subword units, offering a foundation for further text processing and analysis. + +References +---------- + +.. [Lowphansirikul_2021] Lowphansirikul L, Polpanumas C, Jantrakulchai N, Nutanong S. WangchanBERTa: Pretraining transformer-based Thai Language Models. `arXiv:2101.09635 <http://arxiv.org/abs/2101.09635>`_ [Internet]. 2021 Jan 23 [cited 2021 Feb 27]. diff --git a/5.1/_sources/api/word_vector.rst.txt b/5.1/_sources/api/word_vector.rst.txt new file mode 100644 index 0000000..107328e --- /dev/null +++ b/5.1/_sources/api/word_vector.rst.txt @@ -0,0 +1,28 @@ +.. currentmodule:: pythainlp.word_vector + +pythainlp.word_vector +======================= +The :mod:`pythainlp.word_vector` module contains functions that make use of pre-trained, publicly available word vector data. +The `pythainlp.word_vector` module is a valuable resource for working with pre-trained word vectors. These word vectors are trained on large corpora and can be used for various natural language processing tasks, such as word similarity, document similarity, and more. + +Dependencies +------------ +Installation of :mod:`numpy` and :mod:`gensim` is required. + +Before using this module, you need to ensure that the `numpy` and `gensim` libraries are installed in your environment. These libraries are essential for loading and working with the pre-trained word vectors. 
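A minimal usage sketch is shown below. It is an illustration based on the description of the `WordVector` class in the Modules section, not verbatim library documentation: the method names `similarity` and `sentence_vectorizer` follow the capabilities listed for that class, so check the class reference for the exact signatures. ::

    # Hedged sketch: assumes WordVector() loads a default pre-trained model
    # and that similarity() returns a cosine-similarity score as a float.
    from pythainlp.word_vector import WordVector

    wv = WordVector()
    print(wv.similarity("แมว", "หมา"))               # similarity between "cat" and "dog"
    print(wv.sentence_vectorizer("ฉันรักภาษาไทย"))   # vector for a whole sentence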
+ +Modules +------- + +.. autoclass:: WordVector + :members: + + The `WordVector` class encapsulates word vector operations and functions. It provides a convenient interface for loading models, finding word similarities, and generating sentence vectors. + +References +---------- + +- `Omer Levy and Yoav Goldberg (2014). Linguistic Regularities in Sparse and Explicit Word Representations <https://www.aclweb.org/anthology/W14-1618/>`_ + This reference points to the work by Omer Levy and Yoav Goldberg, which discusses linguistic regularities in word representations. It underlines the theoretical foundation of word vectors and their applications in NLP. + +This documentation provides a detailed and organized overview of the `pythainlp.word_vector` module, making it a valuable resource for NLP practitioners and researchers working with pre-trained word vectors in the Thai language. diff --git a/5.1/_sources/api/wsd.rst.txt b/5.1/_sources/api/wsd.rst.txt new file mode 100644 index 0000000..d260faf --- /dev/null +++ b/5.1/_sources/api/wsd.rst.txt @@ -0,0 +1,16 @@ +.. currentmodule:: pythainlp.wsd + +pythainlp.wsd +============= +The :mod:`pythainlp.wsd` module contains a get-word-sense function for Thai Word Sense Disambiguation (WSD). +The `pythainlp.wsd` module is designed to assist in Word Sense Disambiguation (WSD) for the Thai language. Word Sense Disambiguation is a crucial task in natural language processing that involves determining the correct sense or meaning of a word within a given context. This module provides a function for achieving precisely that. + +Modules +------- +.. autofunction:: get_sense + + The `get_sense` function is the primary tool within this module for performing Word Sense Disambiguation in Thai text. Given a word and its context, this function returns the most suitable sense or meaning for that word. This is particularly useful for tasks where word sense ambiguity needs to be resolved, such as text understanding and translation. + +By using the `pythainlp.wsd` module, you can enhance the accuracy of your NLP applications when dealing with Thai text, ensuring that words are interpreted in the correct context. + +.. This improved documentation offers a clear and concise explanation of the purpose of the `pythainlp.wsd` module and its primary function, `get_sense`, in the context of Word Sense Disambiguation. It helps users understand the module's utility in disambiguating word senses within the Thai language, which is valuable for a wide range of NLP applications. diff --git a/5.1/_sources/index.rst.txt b/5.1/_sources/index.rst.txt new file mode 100644 index 0000000..ce80d16 --- /dev/null +++ b/5.1/_sources/index.rst.txt @@ -0,0 +1,47 @@ +.. PyThaiNLP documentation master file, created by + sphinx-quickstart on Sat Jun 23 15:23:30 2018. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +PyThaiNLP documentation +===================================== +.. figure:: ./images/logo.png + :scale: 50 % + +PyThaiNLP is a Python library for Thai natural language processing (NLP). + +Website: `PyThaiNLP.github.io `_ + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Notes + + notes/* + +.. 
toctree:: + :glob: + :maxdepth: 1 + :caption: Package reference: + + api/* + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + +Citations +========= +If you use PyThaiNLP in your project or publication, please cite the library as follows: + + Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Suriyawongkul, Lalita Lowphansirikul, Pattarawat Chormai, Peerat Limkonchotiwat, Thanathip Suntorntip, and Can Udomcharoenchaikit. 2023. PyThaiNLP: Thai Natural Language Processing in Python. In Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023), pages 25–36, Singapore, Singapore. Empirical Methods in Natural Language Processing. + +Apache Software License 2.0 + +Maintained by the PyThaiNLP team. diff --git a/5.1/_sources/notes/FAQ.rst.txt b/5.1/_sources/notes/FAQ.rst.txt new file mode 100644 index 0000000..1a9fb1c --- /dev/null +++ b/5.1/_sources/notes/FAQ.rst.txt @@ -0,0 +1,6 @@ +FAQ +=== + +*Frequently Asked Questions about PyThaiNLP* + +You can read the FAQ at `FAQ | PyThaiNLP GitHub `_ diff --git a/5.1/_sources/notes/command_line.rst.txt b/5.1/_sources/notes/command_line.rst.txt new file mode 100644 index 0000000..a0e9156 --- /dev/null +++ b/5.1/_sources/notes/command_line.rst.txt @@ -0,0 +1,108 @@ +Command Line +============ + +You can use some thainlp functions directly from the command line. + +**Tokenization**:: + + thainlp tokenize [-w] [-nw] [-a newmm|attacut|longest] [-s SEPARATOR] TEXT + +*Example*:: + + $ thainlp tokenize word สภาพการจ้างและสภาพการทำงาน + สภาพการจ้าง|และ|สภาพ|การทำงาน| + + $ thainlp tokenize syllable สภาพการจ้างและสภาพการทำงาน + สภาพ~การ~จ้าง~และ~สภาพ~การ~ทำ~งาน~ + + $ thainlp tokenize subword สภาพการจ้างและสภาพการทำงาน + ส/ภา/พ/กา/ร/จ้า/ง/และ/ส/ภา/พ/กา/ร/ทำ/งา/น/ + + $ thainlp tokenize word -a longest "แรงงานกะดึก: ฟันเฟืองที่ยังหมุนในคำ่คืนมีเคอร์ฟิว" + แรงงาน|กะ|ดึก|:| |ฟันเฟือง|ที่|ยัง|หมุน|ใน|คำ่|คืน|มี|เคอร์ฟิว| + + $ thainlp tokenize word -nw -s "##" "5 เหตุผล 'ไม่ควร' ต่อพ.ร.ก.ฉุกเฉิน" + 5##เหตุผล##'##ไม่##ควร##'##ต่อ##พ.ร.ก.##ฉุกเฉิน## + + $ thainlp tokenize sent "หลายปีที่ผ่านมา ชาวชุมชนโคกยาวหลายคนได้พากันย้ายออก บ้างก็เสียชีวิต บางคนถูกจำคุกในข้อบุกรุกป่าหรือแม้กระทั่งสูญหาย" + หลายปีที่ผ่านมา @@ชาวชุมชนโคกยาวหลายคนได้พากันย้ายออก @@บ้างก็เสียชีวิต @@บางคนถูกจำคุกในข้อบุกรุกป่าหรือแม้กระทั่งสูญหาย@@ + +**Part-Of-Speech tagging**:: + + thainlp tag pos [-s SEPARATOR] TEXT + +*Example*:: + + $ thainlp tag pos -s . ผม.ไม่.กิน.เผ็ด + +**Soundex**:: + + thainlp soundex [-a udom83|lk82|metasound] TEXT + +*Example*:: + + $ thainlp soundex วรรณ + ว330000 + + $ thainlp soundex -a lk82 วัน + ว4000 + + $ thainlp soundex -a lk82 วรรณ + ว4000 + +**Corpus management**:: + + thainlp data + +*Example*:: + + $ thainlp data path + /Users/user1/pythainlp-data + + $ thainlp data catalog + Dataset/corpus available for download: + - crfcut 0.1 + - thai-g2p 0.1 (Local: 0.1) + - thai2fit_wv 0.1 + - thainer-1-3 1.3 + + $ thainlp data get thai2fit_wv + Corpus: thai2fit_wv + - Downloading: thai2fit_wv 0.1 + 36%|█████████████████▉ | + + $ thainlp data --help + +**Benchmark**:: + + thainlp benchmark word-tokenization --input-file --test-file
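Most of these commands are also available as ordinary library calls. The sketch below is an illustration rather than part of the CLI reference: it assumes the word tokenizer behind the `tokenize word` subcommand is `pythainlp.tokenize.word_tokenize` and that engine names such as `newmm` mirror the `-a` option above. ::

    # Hedged sketch of the library-level equivalent of:
    #   thainlp tokenize word -a newmm TEXT
    from pythainlp.tokenize import word_tokenize

    tokens = word_tokenize("สภาพการจ้างและสภาพการทำงาน", engine="newmm")
    print("|".join(tokens))
    # expected to match the tokens shown in the CLI example above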

"),n("table.docutils.footnote").wrap("
"),n("table.docutils.citation").wrap("
"),n(".wy-menu-vertical ul").not(".simple").siblings("a").each((function(){var t=n(this);expand=n(''),expand.on("click",(function(n){return e.toggleCurrent(t),n.stopPropagation(),!1})),t.prepend(expand)}))},reset:function(){var n=encodeURI(window.location.hash)||"#";try{var e=$(".wy-menu-vertical"),t=e.find('[href="'+n+'"]');if(0===t.length){var i=$('.document [id="'+n.substring(1)+'"]').closest("div.section");0===(t=e.find('[href="#'+i.attr("id")+'"]')).length&&(t=e.find('[href="#"]'))}if(t.length>0){$(".wy-menu-vertical .current").removeClass("current").attr("aria-expanded","false"),t.addClass("current").attr("aria-expanded","true"),t.closest("li.toctree-l1").parent().addClass("current").attr("aria-expanded","true");for(let n=1;n<=10;n++)t.closest("li.toctree-l"+n).addClass("current").attr("aria-expanded","true");t[0].scrollIntoView()}}catch(n){console.log("Error expanding nav for anchor",n)}},onScroll:function(){this.winScroll=!1;var n=this.win.scrollTop(),e=n+this.winHeight,t=this.navBar.scrollTop()+(n-this.winPosition);n<0||e>this.docHeight||(this.navBar.scrollTop(t),this.winPosition=n)},onResize:function(){this.winResize=!1,this.winHeight=this.win.height(),this.docHeight=$(document).height()},hashChange:function(){this.linkScroll=!0,this.win.one("hashchange",(function(){this.linkScroll=!1}))},toggleCurrent:function(n){var e=n.closest("li");e.siblings("li.current").removeClass("current").attr("aria-expanded","false"),e.siblings().find("li.current").removeClass("current").attr("aria-expanded","false");var t=e.find("> ul li");t.length&&(t.removeClass("current").attr("aria-expanded","false"),e.toggleClass("current").attr("aria-expanded",(function(n,e){return"true"==e?"false":"true"})))}},"undefined"!=typeof window&&(window.SphinxRtdTheme={Navigation:n.exports.ThemeNav,StickyNav:n.exports.ThemeNav}),function(){for(var n=0,e=["ms","moz","webkit","o"],t=0;t a.language.name.localeCompare(b.language.name)); + + const languagesHTML = ` +
+
Languages
+ ${languages + .map( + (translation) => ` +
+ ${translation.language.code} +
+ `, + ) + .join("\n")} +
+ `; + return languagesHTML; + } + + function renderVersions(config) { + if (!config.versions.active.length) { + return ""; + } + const versionsHTML = ` +
+
Versions
+ ${config.versions.active + .map( + (version) => ` +
+ ${version.slug} +
+ `, + ) + .join("\n")} +
+ `; + return versionsHTML; + } + + function renderDownloads(config) { + if (!Object.keys(config.versions.current.downloads).length) { + return ""; + } + const downloadsNameDisplay = { + pdf: "PDF", + epub: "Epub", + htmlzip: "HTML", + }; + + const downloadsHTML = ` +
+
Downloads
+ ${Object.entries(config.versions.current.downloads) + .map( + ([name, url]) => ` +
+ ${downloadsNameDisplay[name]} +
+ `, + ) + .join("\n")} +
+ `; + return downloadsHTML; + } + + document.addEventListener("readthedocs-addons-data-ready", function (event) { + const config = event.detail.data(); + + const flyout = ` +
+ + Read the Docs + v: ${config.versions.current.slug} + + +
+
+ ${renderLanguages(config)} + ${renderVersions(config)} + ${renderDownloads(config)} +
+
On Read the Docs
+
+ Project Home +
+
+ Builds +
+
+ Downloads +
+
+
+
Search
+
+
+ +
+
+
+
+ + Hosted by Read the Docs + +
+
+ `; + + // Inject the generated flyout into the body HTML element. + document.body.insertAdjacentHTML("beforeend", flyout); + + // Trigger the Read the Docs Addons Search modal when clicking on the "Search docs" input from inside the flyout. + document + .querySelector("#flyout-search-form") + .addEventListener("focusin", () => { + const event = new CustomEvent("readthedocs-search-show"); + document.dispatchEvent(event); + }); + }) +} + +if (themeLanguageSelector || themeVersionSelector) { + function onSelectorSwitch(event) { + const option = event.target.selectedIndex; + const item = event.target.options[option]; + window.location.href = item.dataset.url; + } + + document.addEventListener("readthedocs-addons-data-ready", function (event) { + const config = event.detail.data(); + + const versionSwitch = document.querySelector( + "div.switch-menus > div.version-switch", + ); + if (themeVersionSelector) { + let versions = config.versions.active; + if (config.versions.current.hidden || config.versions.current.type === "external") { + versions.unshift(config.versions.current); + } + const versionSelect = ` + + `; + + versionSwitch.innerHTML = versionSelect; + versionSwitch.firstElementChild.addEventListener("change", onSelectorSwitch); + } + + const languageSwitch = document.querySelector( + "div.switch-menus > div.language-switch", + ); + + if (themeLanguageSelector) { + if (config.projects.translations.length) { + // Add the current language to the options on the selector + let languages = config.projects.translations.concat( + config.projects.current, + ); + languages = languages.sort((a, b) => + a.language.name.localeCompare(b.language.name), + ); + + const languageSelect = ` + + `; + + languageSwitch.innerHTML = languageSelect; + languageSwitch.firstElementChild.addEventListener("change", onSelectorSwitch); + } + else { + languageSwitch.remove(); + } + } + }); +} + +document.addEventListener("readthedocs-addons-data-ready", function (event) { + // Trigger the Read the Docs Addons Search modal when clicking on "Search docs" input from the topnav. + document + .querySelector("[role='search'] input") + .addEventListener("focusin", () => { + const event = new CustomEvent("readthedocs-search-show"); + document.dispatchEvent(event); + }); +}); \ No newline at end of file diff --git a/5.1/_static/language_data.js b/5.1/_static/language_data.js new file mode 100644 index 0000000..c7fe6c6 --- /dev/null +++ b/5.1/_static/language_data.js @@ -0,0 +1,192 @@ +/* + * This script contains the language-specific data used by searchtools.js, + * namely the list of stopwords, stemmer, scorer and splitter. 
+ */ + +var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; + + +/* Non-minified version is copied as a separate JS file, if available */ + +/** + * Porter Stemmer + */ +var Stemmer = function() { + + var step2list = { + ational: 'ate', + tional: 'tion', + enci: 'ence', + anci: 'ance', + izer: 'ize', + bli: 'ble', + alli: 'al', + entli: 'ent', + eli: 'e', + ousli: 'ous', + ization: 'ize', + ation: 'ate', + ator: 'ate', + alism: 'al', + iveness: 'ive', + fulness: 'ful', + ousness: 'ous', + aliti: 'al', + iviti: 'ive', + biliti: 'ble', + logi: 'log' + }; + + var step3list = { + icate: 'ic', + ative: '', + alize: 'al', + iciti: 'ic', + ical: 'ic', + ful: '', + ness: '' + }; + + var c = "[^aeiou]"; // consonant + var v = "[aeiouy]"; // vowel + var C = c + "[^aeiouy]*"; // consonant sequence + var V = v + "[aeiou]*"; // vowel sequence + + var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 + var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 + var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 + var s_v = "^(" + C + ")?" + v; // vowel in stem + + this.stemWord = function (w) { + var stem; + var suffix; + var firstch; + var origword = w; + + if (w.length < 3) + return w; + + var re; + var re2; + var re3; + var re4; + + firstch = w.substr(0,1); + if (firstch == "y") + w = firstch.toUpperCase() + w.substr(1); + + // Step 1a + re = /^(.+?)(ss|i)es$/; + re2 = /^(.+?)([^s])s$/; + + if (re.test(w)) + w = w.replace(re,"$1$2"); + else if (re2.test(w)) + w = w.replace(re2,"$1$2"); + + // Step 1b + re = /^(.+?)eed$/; + re2 = /^(.+?)(ed|ing)$/; + if (re.test(w)) { + var fp = re.exec(w); + re = new RegExp(mgr0); + if (re.test(fp[1])) { + re = /.$/; + w = w.replace(re,""); + } + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1]; + re2 = new RegExp(s_v); + if (re2.test(stem)) { + w = stem; + re2 = /(at|bl|iz)$/; + re3 = new RegExp("([^aeiouylsz])\\1$"); + re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re2.test(w)) + w = w + "e"; + else if (re3.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + else if (re4.test(w)) + w = w + "e"; + } + } + + // Step 1c + re = /^(.+?)y$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(s_v); + if (re.test(stem)) + w = stem + "i"; + } + + // Step 2 + re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step2list[suffix]; + } + + // Step 3 + re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step3list[suffix]; + } + + // Step 4 + re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; + re2 = /^(.+?)(s|t)(ion)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + if (re.test(stem)) + w = stem; + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1] + fp[2]; + re2 = new RegExp(mgr1); + if (re2.test(stem)) + w = stem; + } + + // Step 5 + re = /^(.+?)e$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + re2 = new 
RegExp(meq1); + re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) + w = stem; + } + re = /ll$/; + re2 = new RegExp(mgr1); + if (re.test(w) && re2.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + + // and turn initial Y back to y + if (firstch == "y") + w = firstch.toLowerCase() + w.substr(1); + return w; + } +} + diff --git a/5.1/_static/minus.png b/5.1/_static/minus.png new file mode 100644 index 0000000..d96755f Binary files /dev/null and b/5.1/_static/minus.png differ diff --git a/5.1/_static/plus.png b/5.1/_static/plus.png new file mode 100644 index 0000000..7107cec Binary files /dev/null and b/5.1/_static/plus.png differ diff --git a/5.1/_static/pygments.css b/5.1/_static/pygments.css new file mode 100644 index 0000000..5f2b0a2 --- /dev/null +++ b/5.1/_static/pygments.css @@ -0,0 +1,75 @@ +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +.highlight .hll { background-color: #ffffcc } +.highlight { background: #eeffcc; } +.highlight .c { color: #408090; font-style: italic } /* Comment */ +.highlight .err { border: 1px solid #F00 } /* Error */ +.highlight .k { color: #007020; font-weight: bold } /* Keyword */ +.highlight .o { color: #666 } /* Operator */ +.highlight .ch { color: #408090; font-style: italic } /* Comment.Hashbang */ +.highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */ +.highlight .cp { color: #007020 } /* Comment.Preproc */ +.highlight .cpf { color: #408090; font-style: italic } /* Comment.PreprocFile */ +.highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */ +.highlight .cs { color: #408090; background-color: #FFF0F0 } /* Comment.Special */ +.highlight .gd { color: #A00000 } /* Generic.Deleted */ +.highlight .ge { font-style: italic } /* Generic.Emph */ +.highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +.highlight .gr { color: #F00 } /* Generic.Error */ +.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +.highlight .gi { color: #00A000 } /* Generic.Inserted */ +.highlight .go { color: #333 } /* Generic.Output */ +.highlight .gp { color: #C65D09; font-weight: bold } /* Generic.Prompt */ +.highlight .gs { font-weight: bold } /* Generic.Strong */ +.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +.highlight .gt { color: #04D } /* Generic.Traceback */ +.highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */ +.highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */ +.highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */ +.highlight .kp { color: #007020 } /* Keyword.Pseudo */ +.highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */ +.highlight .kt { color: #902000 } /* Keyword.Type */ +.highlight .m { color: #208050 } /* Literal.Number */ +.highlight .s { color: #4070A0 } /* Literal.String */ +.highlight .na { color: #4070A0 } /* Name.Attribute */ +.highlight .nb { color: #007020 } /* Name.Builtin */ +.highlight .nc { color: #0E84B5; font-weight: bold } /* Name.Class */ +.highlight .no { color: 
#60ADD5 } /* Name.Constant */ +.highlight .nd { color: #555; font-weight: bold } /* Name.Decorator */ +.highlight .ni { color: #D55537; font-weight: bold } /* Name.Entity */ +.highlight .ne { color: #007020 } /* Name.Exception */ +.highlight .nf { color: #06287E } /* Name.Function */ +.highlight .nl { color: #002070; font-weight: bold } /* Name.Label */ +.highlight .nn { color: #0E84B5; font-weight: bold } /* Name.Namespace */ +.highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */ +.highlight .nv { color: #BB60D5 } /* Name.Variable */ +.highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */ +.highlight .w { color: #BBB } /* Text.Whitespace */ +.highlight .mb { color: #208050 } /* Literal.Number.Bin */ +.highlight .mf { color: #208050 } /* Literal.Number.Float */ +.highlight .mh { color: #208050 } /* Literal.Number.Hex */ +.highlight .mi { color: #208050 } /* Literal.Number.Integer */ +.highlight .mo { color: #208050 } /* Literal.Number.Oct */ +.highlight .sa { color: #4070A0 } /* Literal.String.Affix */ +.highlight .sb { color: #4070A0 } /* Literal.String.Backtick */ +.highlight .sc { color: #4070A0 } /* Literal.String.Char */ +.highlight .dl { color: #4070A0 } /* Literal.String.Delimiter */ +.highlight .sd { color: #4070A0; font-style: italic } /* Literal.String.Doc */ +.highlight .s2 { color: #4070A0 } /* Literal.String.Double */ +.highlight .se { color: #4070A0; font-weight: bold } /* Literal.String.Escape */ +.highlight .sh { color: #4070A0 } /* Literal.String.Heredoc */ +.highlight .si { color: #70A0D0; font-style: italic } /* Literal.String.Interpol */ +.highlight .sx { color: #C65D09 } /* Literal.String.Other */ +.highlight .sr { color: #235388 } /* Literal.String.Regex */ +.highlight .s1 { color: #4070A0 } /* Literal.String.Single */ +.highlight .ss { color: #517918 } /* Literal.String.Symbol */ +.highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */ +.highlight .fm { color: #06287E } /* Name.Function.Magic */ +.highlight .vc { color: #BB60D5 } /* Name.Variable.Class */ +.highlight .vg { color: #BB60D5 } /* Name.Variable.Global */ +.highlight .vi { color: #BB60D5 } /* Name.Variable.Instance */ +.highlight .vm { color: #BB60D5 } /* Name.Variable.Magic */ +.highlight .il { color: #208050 } /* Literal.Number.Integer.Long */ \ No newline at end of file diff --git a/5.1/_static/searchtools.js b/5.1/_static/searchtools.js new file mode 100644 index 0000000..2c774d1 --- /dev/null +++ b/5.1/_static/searchtools.js @@ -0,0 +1,632 @@ +/* + * Sphinx JavaScript utilities for the full-text search. + */ +"use strict"; + +/** + * Simple result scoring code. + */ +if (typeof Scorer === "undefined") { + var Scorer = { + // Implement the following function to further tweak the score for each result + // The function takes a result array [docname, title, anchor, descr, score, filename] + // and returns the new score. + /* + score: result => { + const [docname, title, anchor, descr, score, filename, kind] = result + return score + }, + */ + + // query matches the full name of an object + objNameMatch: 11, + // or matches in the last dotted part of the object name + objPartialMatch: 6, + // Additive scores depending on the priority of the object + objPrio: { + 0: 15, // used to be importantResults + 1: 5, // used to be objectResults + 2: -5, // used to be unimportantResults + }, + // Used when the priority is not in the mapping. 
+ objPrioDefault: 0, + + // query found in title + title: 15, + partialTitle: 7, + // query found in terms + term: 5, + partialTerm: 2, + }; +} + +// Global search result kind enum, used by themes to style search results. +class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + +const _removeChildren = (element) => { + while (element && element.lastChild) element.removeChild(element.lastChild); +}; + +/** + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#escaping + */ +const _escapeRegExp = (string) => + string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string + +const _displayItem = (item, searchTerms, highlightTerms) => { + const docBuilder = DOCUMENTATION_OPTIONS.BUILDER; + const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX; + const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX; + const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; + const contentRoot = document.documentElement.dataset.content_root; + + const [docName, title, anchor, descr, score, _filename, kind] = item; + + let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); + let requestUrl; + let linkUrl; + if (docBuilder === "dirhtml") { + // dirhtml builder + let dirname = docName + "/"; + if (dirname.match(/\/index\/$/)) + dirname = dirname.substring(0, dirname.length - 6); + else if (dirname === "index/") dirname = ""; + requestUrl = contentRoot + dirname; + linkUrl = requestUrl; + } else { + // normal html builders + requestUrl = contentRoot + docName + docFileSuffix; + linkUrl = docName + docLinkSuffix; + } + let linkEl = listItem.appendChild(document.createElement("a")); + linkEl.href = linkUrl + anchor; + linkEl.dataset.score = score; + linkEl.innerHTML = title; + if (descr) { + listItem.appendChild(document.createElement("span")).innerHTML = + " (" + descr + ")"; + // highlight search terms in the description + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); + } + else if (showSearchSummary) + fetch(requestUrl) + .then((responseData) => responseData.text()) + .then((data) => { + if (data) + listItem.appendChild( + Search.makeSearchSummary(data, searchTerms, anchor) + ); + // highlight search terms in the summary + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); + }); + Search.output.appendChild(listItem); +}; +const _finishSearch = (resultCount) => { + Search.stopPulse(); + Search.title.innerText = _("Search Results"); + if (!resultCount) + Search.status.innerText = Documentation.gettext( + "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." 
+ ); + else + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, + ).replace('${resultCount}', resultCount); +}; +const _displayNextItem = ( + results, + resultCount, + searchTerms, + highlightTerms, +) => { + // results left, load the summary and display it + // this is intended to be dynamic (don't sub resultsCount) + if (results.length) { + _displayItem(results.pop(), searchTerms, highlightTerms); + setTimeout( + () => _displayNextItem(results, resultCount, searchTerms, highlightTerms), + 5 + ); + } + // search finished, update title and status message + else _finishSearch(resultCount); +}; +// Helper function used by query() to order search results. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. +// Order the results by score (in opposite order of appearance, since the +// `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. +const _orderResultsByScoreThenName = (a, b) => { + const leftScore = a[4]; + const rightScore = b[4]; + if (leftScore === rightScore) { + // same score: sort alphabetically + const leftTitle = a[1].toLowerCase(); + const rightTitle = b[1].toLowerCase(); + if (leftTitle === rightTitle) return 0; + return leftTitle > rightTitle ? -1 : 1; // inverted is intentional + } + return leftScore > rightScore ? 1 : -1; +}; + +/** + * Default splitQuery function. Can be overridden in ``sphinx.search`` with a + * custom function per language. + * + * The regular expression works by splitting the string on consecutive characters + * that are not Unicode letters, numbers, underscores, or emoji characters. + * This is the same as ``\W+`` in Python, preserving the surrogate pair area. + */ +if (typeof splitQuery === "undefined") { + var splitQuery = (query) => query + .split(/[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu) + .filter(term => term) // remove remaining empty strings +} + +/** + * Search Module + */ +const Search = { + _index: null, + _queued_query: null, + _pulse_status: -1, + + htmlToText: (htmlString, anchor) => { + const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html'); + for (const removalQuery of [".headerlink", "script", "style"]) { + htmlElement.querySelectorAll(removalQuery).forEach((el) => { el.remove() }); + } + if (anchor) { + const anchorContent = htmlElement.querySelector(`[role="main"] ${anchor}`); + if (anchorContent) return anchorContent.textContent; + + console.warn( + `Anchored content block not found. Sphinx search tries to obtain it via DOM query '[role=main] ${anchor}'. Check your theme or template.` + ); + } + + // if anchor not specified or not found, fall back to main content + const docContent = htmlElement.querySelector('[role="main"]'); + if (docContent) return docContent.textContent; + + console.warn( + "Content block not found. Sphinx search tries to obtain it via DOM query '[role=main]'. Check your theme or template." 
+ ); + return ""; + }, + + init: () => { + const query = new URLSearchParams(window.location.search).get("q"); + document + .querySelectorAll('input[name="q"]') + .forEach((el) => (el.value = query)); + if (query) Search.performSearch(query); + }, + + loadIndex: (url) => + (document.body.appendChild(document.createElement("script")).src = url), + + setIndex: (index) => { + Search._index = index; + if (Search._queued_query !== null) { + const query = Search._queued_query; + Search._queued_query = null; + Search.query(query); + } + }, + + hasIndex: () => Search._index !== null, + + deferQuery: (query) => (Search._queued_query = query), + + stopPulse: () => (Search._pulse_status = -1), + + startPulse: () => { + if (Search._pulse_status >= 0) return; + + const pulse = () => { + Search._pulse_status = (Search._pulse_status + 1) % 4; + Search.dots.innerText = ".".repeat(Search._pulse_status); + if (Search._pulse_status >= 0) window.setTimeout(pulse, 500); + }; + pulse(); + }, + + /** + * perform a search for something (or wait until index is loaded) + */ + performSearch: (query) => { + // create the required interface elements + const searchText = document.createElement("h2"); + searchText.textContent = _("Searching"); + const searchSummary = document.createElement("p"); + searchSummary.classList.add("search-summary"); + searchSummary.innerText = ""; + const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); + searchList.classList.add("search"); + + const out = document.getElementById("search-results"); + Search.title = out.appendChild(searchText); + Search.dots = Search.title.appendChild(document.createElement("span")); + Search.status = out.appendChild(searchSummary); + Search.output = out.appendChild(searchList); + + const searchProgress = document.getElementById("search-progress"); + // Some themes don't use the search progress node + if (searchProgress) { + searchProgress.innerText = _("Preparing search..."); + } + Search.startPulse(); + + // index already loaded, the browser was quick! 
+ if (Search.hasIndex()) Search.query(query); + else Search.deferQuery(query); + }, + + _parseQuery: (query) => { + // stem the search terms and add them to the correct list + const stemmer = new Stemmer(); + const searchTerms = new Set(); + const excludedTerms = new Set(); + const highlightTerms = new Set(); + const objectTerms = new Set(splitQuery(query.toLowerCase().trim())); + splitQuery(query.trim()).forEach((queryTerm) => { + const queryTermLower = queryTerm.toLowerCase(); + + // maybe skip this "word" + // stopwords array is from language_data.js + if ( + stopwords.indexOf(queryTermLower) !== -1 || + queryTerm.match(/^\d+$/) + ) + return; + + // stem the word + let word = stemmer.stemWord(queryTermLower); + // select the correct list + if (word[0] === "-") excludedTerms.add(word.substr(1)); + else { + searchTerms.add(word); + highlightTerms.add(queryTermLower); + } + }); + + if (SPHINX_HIGHLIGHT_ENABLED) { // set in sphinx_highlight.js + localStorage.setItem("sphinx_highlight_terms", [...highlightTerms].join(" ")) + } + + // console.debug("SEARCH: searching for:"); + // console.info("required: ", [...searchTerms]); + // console.info("excluded: ", [...excludedTerms]); + + return [query, searchTerms, excludedTerms, highlightTerms, objectTerms]; + }, + + /** + * execute search (requires search index to be loaded) + */ + _performSearch: (query, searchTerms, excludedTerms, highlightTerms, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + const allTitles = Search._index.alltitles; + const indexEntries = Search._index.indexentries; + + // Collect multiple result groups to be sorted separately and then ordered. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. + const normalResults = []; + const nonMainIndexResults = []; + + _removeChildren(document.getElementById("search-progress")); + + const queryLower = query.toLowerCase().trim(); + for (const [title, foundTitles] of Object.entries(allTitles)) { + if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) { + for (const [file, id] of foundTitles) { + const score = Math.round(Scorer.title * queryLower.length / title.length); + const boost = titles[file] === title ? 1 : 0; // add a boost for document titles + normalResults.push([ + docNames[file], + titles[file] !== title ? `${titles[file]} > ${title}` : title, + id !== null ? "#" + id : "", + null, + score + boost, + filenames[file], + SearchResultKind.title, + ]); + } + } + } + + // search for explicit entries in index directives + for (const [entry, foundEntries] of Object.entries(indexEntries)) { + if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) { + for (const [file, id, isMain] of foundEntries) { + const score = Math.round(100 * queryLower.length / entry.length); + const result = [ + docNames[file], + titles[file], + id ? 
"#" + id : "", + null, + score, + filenames[file], + SearchResultKind.index, + ]; + if (isMain) { + normalResults.push(result); + } else { + nonMainIndexResults.push(result); + } + } + } + } + + // lookup as object + objectTerms.forEach((term) => + normalResults.push(...Search.performObjectSearch(term, objectTerms)) + ); + + // lookup as search terms in fulltext + normalResults.push(...Search.performTermsSearch(searchTerms, excludedTerms)); + + // let the scorer override scores with a custom scoring function + if (Scorer.score) { + normalResults.forEach((item) => (item[4] = Scorer.score(item))); + nonMainIndexResults.forEach((item) => (item[4] = Scorer.score(item))); + } + + // Sort each group of results by score and then alphabetically by name. + normalResults.sort(_orderResultsByScoreThenName); + nonMainIndexResults.sort(_orderResultsByScoreThenName); + + // Combine the result groups in (reverse) order. + // Non-main index entries are typically arbitrary cross-references, + // so display them after other results. + let results = [...nonMainIndexResults, ...normalResults]; + + // remove duplicate search results + // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept + let seen = new Set(); + results = results.reverse().reduce((acc, result) => { + let resultStr = result.slice(0, 4).concat([result[5]]).map(v => String(v)).join(','); + if (!seen.has(resultStr)) { + acc.push(result); + seen.add(resultStr); + } + return acc; + }, []); + + return results.reverse(); + }, + + query: (query) => { + const [searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms] = Search._parseQuery(query); + const results = Search._performSearch(searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms); + + // for debugging + //Search.lastresults = results.slice(); // a copy + // console.info("search results:", Search.lastresults); + + // print the results + _displayNextItem(results, results.length, searchTerms, highlightTerms); + }, + + /** + * search for object names + */ + performObjectSearch: (object, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const objects = Search._index.objects; + const objNames = Search._index.objnames; + const titles = Search._index.titles; + + const results = []; + + const objectSearchCallback = (prefix, match) => { + const name = match[4] + const fullname = (prefix ? prefix + "." : "") + name; + const fullnameLower = fullname.toLowerCase(); + if (fullnameLower.indexOf(object) < 0) return; + + let score = 0; + const parts = fullnameLower.split("."); + + // check for different match types: exact matches of full name or + // "last name" (i.e. 
last dotted part) + if (fullnameLower === object || parts.slice(-1)[0] === object) + score += Scorer.objNameMatch; + else if (parts.slice(-1)[0].indexOf(object) > -1) + score += Scorer.objPartialMatch; // matches in last name + + const objName = objNames[match[1]][2]; + const title = titles[match[0]]; + + // If more than one term searched for, we require other words to be + // found in the name/title/description + const otherTerms = new Set(objectTerms); + otherTerms.delete(object); + if (otherTerms.size > 0) { + const haystack = `${prefix} ${name} ${objName} ${title}`.toLowerCase(); + if ( + [...otherTerms].some((otherTerm) => haystack.indexOf(otherTerm) < 0) + ) + return; + } + + let anchor = match[3]; + if (anchor === "") anchor = fullname; + else if (anchor === "-") anchor = objNames[match[1]][1] + "-" + fullname; + + const descr = objName + _(", in ") + title; + + // add custom score for some objects according to scorer + if (Scorer.objPrio.hasOwnProperty(match[2])) + score += Scorer.objPrio[match[2]]; + else score += Scorer.objPrioDefault; + + results.push([ + docNames[match[0]], + fullname, + "#" + anchor, + descr, + score, + filenames[match[0]], + SearchResultKind.object, + ]); + }; + Object.keys(objects).forEach((prefix) => + objects[prefix].forEach((array) => + objectSearchCallback(prefix, array) + ) + ); + return results; + }, + + /** + * search for full-text terms in the index + */ + performTermsSearch: (searchTerms, excludedTerms) => { + // prepare search + const terms = Search._index.terms; + const titleTerms = Search._index.titleterms; + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + + const scoreMap = new Map(); + const fileMap = new Map(); + + // perform the search on the required terms + searchTerms.forEach((word) => { + const files = []; + const arr = [ + { files: terms[word], score: Scorer.term }, + { files: titleTerms[word], score: Scorer.title }, + ]; + // add support for partial matches + if (word.length > 2) { + const escapedWord = _escapeRegExp(word); + if (!terms.hasOwnProperty(word)) { + Object.keys(terms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: terms[term], score: Scorer.partialTerm }); + }); + } + if (!titleTerms.hasOwnProperty(word)) { + Object.keys(titleTerms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: titleTerms[term], score: Scorer.partialTitle }); + }); + } + } + + // no match but word was a required one + if (arr.every((record) => record.files === undefined)) return; + + // found search word in contents + arr.forEach((record) => { + if (record.files === undefined) return; + + let recordFiles = record.files; + if (recordFiles.length === undefined) recordFiles = [recordFiles]; + files.push(...recordFiles); + + // set score for the word in each file + recordFiles.forEach((file) => { + if (!scoreMap.has(file)) scoreMap.set(file, {}); + scoreMap.get(file)[word] = record.score; + }); + }); + + // create the mapping + files.forEach((file) => { + if (!fileMap.has(file)) fileMap.set(file, [word]); + else if (fileMap.get(file).indexOf(word) === -1) fileMap.get(file).push(word); + }); + }); + + // now check if the files don't contain excluded terms + const results = []; + for (const [file, wordList] of fileMap) { + // check if all requirements are matched + + // as search terms with length < 3 are discarded + const filteredTermCount = [...searchTerms].filter( + (term) => term.length > 2 + ).length; + if ( + wordList.length !== 
searchTerms.size && + wordList.length !== filteredTermCount + ) + continue; + + // ensure that none of the excluded terms is in the search result + if ( + [...excludedTerms].some( + (term) => + terms[term] === file || + titleTerms[term] === file || + (terms[term] || []).includes(file) || + (titleTerms[term] || []).includes(file) + ) + ) + break; + + // select one (max) score for the file. + const score = Math.max(...wordList.map((w) => scoreMap.get(file)[w])); + // add result to the result list + results.push([ + docNames[file], + titles[file], + "", + null, + score, + filenames[file], + SearchResultKind.text, + ]); + } + return results; + }, + + /** + * helper function to return a node containing the + * search summary for a given text. keywords is a list + * of stemmed words. + */ + makeSearchSummary: (htmlText, keywords, anchor) => { + const text = Search.htmlToText(htmlText, anchor); + if (text === "") return null; + + const textLower = text.toLowerCase(); + const actualStartPosition = [...keywords] + .map((k) => textLower.indexOf(k.toLowerCase())) + .filter((i) => i > -1) + .slice(-1)[0]; + const startWithContext = Math.max(actualStartPosition - 120, 0); + + const top = startWithContext === 0 ? "" : "..."; + const tail = startWithContext + 240 < text.length ? "..." : ""; + + let summary = document.createElement("p"); + summary.classList.add("context"); + summary.textContent = top + text.substr(startWithContext, 240).trim() + tail; + + return summary; + }, +}; + +_ready(Search.init); diff --git a/5.1/_static/sphinx_highlight.js b/5.1/_static/sphinx_highlight.js new file mode 100644 index 0000000..8a96c69 --- /dev/null +++ b/5.1/_static/sphinx_highlight.js @@ -0,0 +1,154 @@ +/* Highlighting utilities for Sphinx HTML documentation. */ +"use strict"; + +const SPHINX_HIGHLIGHT_ENABLED = true + +/** + * highlight a given string on a node by wrapping it in + * span elements with the given class name. + */ +const _highlight = (node, addItems, text, className) => { + if (node.nodeType === Node.TEXT_NODE) { + const val = node.nodeValue; + const parent = node.parentNode; + const pos = val.toLowerCase().indexOf(text); + if ( + pos >= 0 && + !parent.classList.contains(className) && + !parent.classList.contains("nohighlight") + ) { + let span; + + const closestNode = parent.closest("body, svg, foreignObject"); + const isInSVG = closestNode && closestNode.matches("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.classList.add(className); + } + + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + const rest = document.createTextNode(val.substr(pos + text.length)); + parent.insertBefore( + span, + parent.insertBefore( + rest, + node.nextSibling + ) + ); + node.nodeValue = val.substr(0, pos); + /* There may be more occurrences of search term in this node. So call this + * function recursively on the remaining fragment. 
+ */ + _highlight(rest, addItems, text, className); + + if (isInSVG) { + const rect = document.createElementNS( + "http://www.w3.org/2000/svg", + "rect" + ); + const bbox = parent.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute("class", className); + addItems.push({ parent: parent, target: rect }); + } + } + } else if (node.matches && !node.matches("button, select, textarea")) { + node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); + } +}; +const _highlightText = (thisNode, text, className) => { + let addItems = []; + _highlight(thisNode, addItems, text, className); + addItems.forEach((obj) => + obj.parent.insertAdjacentElement("beforebegin", obj.target) + ); +}; + +/** + * Small JavaScript module for the documentation. + */ +const SphinxHighlight = { + + /** + * highlight the search words provided in localstorage in the text + */ + highlightSearchWords: () => { + if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight + + // get and clear terms from localstorage + const url = new URL(window.location); + const highlight = + localStorage.getItem("sphinx_highlight_terms") + || url.searchParams.get("highlight") + || ""; + localStorage.removeItem("sphinx_highlight_terms") + url.searchParams.delete("highlight"); + window.history.replaceState({}, "", url); + + // get individual terms from highlight string + const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); + if (terms.length === 0) return; // nothing to do + + // There should never be more than one element matching "div.body" + const divBody = document.querySelectorAll("div.body"); + const body = divBody.length ? divBody[0] : document.querySelector("body"); + window.setTimeout(() => { + terms.forEach((term) => _highlightText(body, term, "highlighted")); + }, 10); + + const searchBox = document.getElementById("searchbox"); + if (searchBox === null) return; + searchBox.appendChild( + document + .createRange() + .createContextualFragment( + '" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + localStorage.removeItem("sphinx_highlight_terms") + }, + + initEscapeListener: () => { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; + if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { + SphinxHighlight.hideSearchWords(); + event.preventDefault(); + } + }); + }, +}; + +_ready(() => { + /* Do not call highlightSearchWords() when we are on the search page. + * It will highlight words from the *previous* search query. 
+ */ + if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); + SphinxHighlight.initEscapeListener(); +}); diff --git a/5.1/_static/style.css b/5.1/_static/style.css new file mode 100644 index 0000000..b07bdb1 --- /dev/null +++ b/5.1/_static/style.css @@ -0,0 +1,3 @@ +.wy-nav-content { + max-width: none; +} diff --git a/5.1/api/.ipynb_checkpoints/augment-checkpoint.html b/5.1/api/.ipynb_checkpoints/augment-checkpoint.html new file mode 100644 index 0000000..124cbd4 --- /dev/null +++ b/5.1/api/.ipynb_checkpoints/augment-checkpoint.html @@ -0,0 +1,606 @@ + + + + + + + + + pythainlp.augment — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.augment

+
+

Introduction

+

The pythainlp.augment module is a powerful toolset for text augmentation in the Thai language. Text augmentation is a process that enriches and diversifies textual data by generating alternative versions of the original text. This module is a valuable resource for improving the quality and variety of Thai language data for NLP tasks.

+
+
+

WordNetAug Class

+

The WordNetAug class is designed to perform text augmentation using WordNet, a lexical database. This class enables you to augment Thai text with synonyms drawn from WordNet, offering a dictionary-based approach to text diversification. The following methods are available within this class:

+
+
+class pythainlp.augment.WordNetAug[source]
+

Text Augment using wordnet

+
+
+__init__()[source]
+
+ +
+
+find_synonyms(word: str, pos: str | None = None, postag_corpus: str = 'orchid') List[str][source]
+

Find synonyms using wordnet

+
+
Parameters:
+
    +
  • word (str) – word

  • +
  • pos (str) – part-of-speech type

  • +
  • postag_corpus (str) – name of POS tag corpus

  • +
+
+
Returns:
+

list of synonyms

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+augment(sentence: str, tokenize: object = <function word_tokenize>, max_syn_sent: int = 6, postag: bool = True, postag_corpus: str = 'orchid') List[List[str]][source]
+

Text Augment using wordnet

+
+
Parameters:
+
    +
  • sentence (str) – Thai sentence

  • +
  • tokenize (object) – function for tokenizing words

  • +
  • max_syn_sent (int) – maximum number of synonymous sentences

  • +
  • postag (bool) – use part-of-speech

  • +
  • postag_corpus (str) – name of POS tag corpus

  • +
+
+
Returns:
+

list of synonyms

+
+
Return type:
+

List[Tuple[str]]

+
+
Example:
+

+
+
from pythainlp.augment import WordNetAug
+
+aug = WordNetAug()
+aug.augment("เราชอบไปโรงเรียน")
+# output: [('เรา', 'ชอบ', 'ไป', 'ร.ร.'),
+ ('เรา', 'ชอบ', 'ไป', 'รร.'),
+ ('เรา', 'ชอบ', 'ไป', 'โรงเรียน'),
+ ('เรา', 'ชอบ', 'ไป', 'อาคารเรียน'),
+ ('เรา', 'ชอบ', 'ไปยัง', 'ร.ร.'),
+ ('เรา', 'ชอบ', 'ไปยัง', 'รร.')]
+
+
+
+ +
+ +
+
+

Word2VecAug, Thai2fitAug, LTW2VAug Classes

+

The pythainlp.augment.word2vec package contains multiple classes for text augmentation using Word2Vec models. These classes include Word2VecAug, Thai2fitAug, and LTW2VAug. Each of these classes allows you to use Word2Vec embeddings to generate text variations. Explore the methods provided by these classes to understand their capabilities.

+
+
+class pythainlp.augment.word2vec.Word2VecAug(model: str, tokenize: object, type: str = 'file')[source]
+
+
+__init__(model: str, tokenize: object, type: str = 'file') None[source]
+
+
Parameters:
+
    +
  • model (str) – path of model

  • +
  • tokenize (object) – tokenize function

  • +
  • type (str) – model type (file, binary)

  • +
+
+
+
+ +
+
+modify_sent(sent: str, p: float = 0.7) List[List[str]][source]
+
+
Parameters:
+
    +
  • sent (str) – text of sentence

  • +
  • p (float) – probability

  • +
+
+
Return type:
+

List[List[str]]

+
+
+
+ +
+
+augment(sentence: str, n_sent: int = 1, p: float = 0.7) List[Tuple[str]][source]
+
+
Parameters:
+
    +
  • sentence (str) – text of sentence

  • +
  • n_sent (int) – maximum number of synonymous sentences

  • +
  • p (float) – probability

  • +
+
+
Returns:
+

list of synonyms

+
+
Return type:
+

List[Tuple[str]]

+
+
+
+ +
+ +
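A minimal usage sketch for Word2VecAug. The model path "my_word2vec.bin" and the use of pythainlp.tokenize.word_tokenize are illustrative assumptions; point the class at your own word2vec file and tokenizer:

from pythainlp.augment.word2vec import Word2VecAug
from pythainlp.tokenize import word_tokenize

# "my_word2vec.bin" is a placeholder path to a local word2vec model file
aug = Word2VecAug("my_word2vec.bin", tokenize=word_tokenize, type="binary")
aug.augment("ผมเรียน", n_sent=2, p=0.5)
# returns a list of augmented sentences, each as a tuple of tokens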
+
+class pythainlp.augment.word2vec.Thai2fitAug[source]
+

Text Augment using word2vec from Thai2Fit

+

Thai2Fit: +github.com/cstorm125/thai2fit

+
+
+__init__()[source]
+
+ +
+
+tokenizer(text: str) List[str][source]
+
+
Parameters:
+

text (str) – Thai text

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+load_w2v()[source]
+

Load Thai2Fit’s word2vec model

+
+ +
+
+augment(sentence: str, n_sent: int = 1, p: float = 0.7) List[Tuple[str]][source]
+

Text Augment using word2vec from Thai2Fit

+
+
Parameters:
+
    +
  • sentence (str) – Thai sentence

  • +
  • n_sent (int) – number of sentences

  • +
  • p (float) – probability of word

  • +
+
+
Returns:
+

list of text augmented

+
+
Return type:
+

List[Tuple[str]]

+
+
Example:
+

+
+
from pythainlp.augment.word2vec import Thai2fitAug
+
+aug = Thai2fitAug()
+aug.augment("ผมเรียน", n_sent=2, p=0.5)
+# output: [('พวกเรา', 'เรียน'), ('ฉัน', 'เรียน')]
+
+
+
+ +
+ +
+
+class pythainlp.augment.word2vec.LTW2VAug[source]
+

Text Augment using word2vec from LTW2V

+

LTW2V: +github.com/PyThaiNLP/large-thaiword2vec

+
+
+__init__()[source]
+
+ +
+
+tokenizer(text: str) List[str][source]
+
+
Parameters:
+

text (str) – Thai text

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+load_w2v()[source]
+

Load LTW2V’s word2vec model

+
+ +
+
+augment(sentence: str, n_sent: int = 1, p: float = 0.7) List[Tuple[str]][source]
+

Text Augment using word2vec from LTW2V

+
+
Parameters:
+
    +
  • sentence (str) – Thai sentence

  • +
  • n_sent (int) – number of sentences

  • +
  • p (float) – probability of word

  • +
+
+
Returns:
+

list of text augmented

+
+
Return type:
+

List[Tuple[str]]

+
+
Example:
+

+
+
from pythainlp.augment.word2vec import LTW2VAug
+
+aug = LTW2VAug()
+aug.augment("ผมเรียน", n_sent=2, p=0.5)
+# output: [('เขา', 'เรียนหนังสือ'), ('เขา', 'สมัครเรียน')]
+
+
+
+ +
+ +
+
+

FastTextAug and Thai2transformersAug Classes

+

The pythainlp.augment.lm package offers classes for text augmentation using language models. These classes include FastTextAug and Thai2transformersAug. These classes allow you to use language model-based techniques to diversify text data. Explore their methods to understand their capabilities.

+
+
+class pythainlp.augment.lm.FastTextAug(model_path: str)[source]
+

Text Augment from fastText

+
+
Parameters:
+

model_path (str) – path of model file

+
+
+
+
+__init__(model_path: str)[source]
+
+
Parameters:
+

model_path (str) – path of model file

+
+
+
+ +
+
+tokenize(text: str) List[str][source]
+

Thai text tokenization for fastText

+
+
Parameters:
+

text (str) – Thai text

+
+
Returns:
+

list of words

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+modify_sent(sent: str, p: float = 0.7) List[List[str]][source]
+
+
Parameters:
+
    +
  • sent (str) – text of sentence

  • +
  • p (float) – probability

  • +
+
+
Return type:
+

List[List[str]]

+
+
+
+ +
+
+augment(sentence: str, n_sent: int = 1, p: float = 0.7) List[Tuple[str]][source]
+

Text Augment from fastText

+

You may want to download the Thai model +from https://fasttext.cc/docs/en/crawl-vectors.html.

+
+
Parameters:
+
    +
  • sentence (str) – Thai sentence

  • +
  • n_sent (int) – number of sentences

  • +
  • p (float) – probability of word

  • +
+
+
Returns:
+

list of synonyms

+
+
Return type:
+

List[Tuple[str]]

+
+
+
+ +
+ +
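A usage sketch for FastTextAug. The filename "cc.th.300.bin" is an assumption referring to the Thai fastText vectors mentioned in the augment() note below; download the model separately and adjust the path:

from pythainlp.augment.lm import FastTextAug

# placeholder path to a Thai fastText model downloaded from fasttext.cc
aug = FastTextAug("cc.th.300.bin")
aug.augment("ผมเรียน", n_sent=2, p=0.5)
# returns a list of augmented sentences, each as a tuple of tokens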
+
+class pythainlp.augment.lm.Thai2transformersAug[source]
+
+
+__init__()[source]
+
+ +
+
+generate(sentence: str, num_replace_tokens: int = 3)[source]
+
+ +
+
+augment(sentence: str, num_replace_tokens: int = 3) List[str][source]
+

Text augmentation from WangchanBERTa

+
+
Parameters:
+
    +
  • sentence (str) – Thai sentence

  • +
  • num_replace_tokens (int) – number of tokens to replace

  • +
+
+
Returns:
+

list of text augment

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.augment.lm import Thai2transformersAug
+
+aug = Thai2transformersAug()
+
+aug.augment("ช้างมีทั้งหมด 50 ตัว บน")
+# output: ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้',
+ 'ช้างมีทั้งหมด 50 ตัว บนสุด',
+ 'ช้างมีทั้งหมด 50 ตัว บนบก',
+ 'ช้างมีทั้งหมด 50 ตัว บนนั้น',
+ 'ช้างมีทั้งหมด 50 ตัว บนหัว']
+
+
+
+ +
+ +
+
+

BPEmbAug Class

+

The pythainlp.augment.word2vec.bpemb_wv package contains the BPEmbAug class, which is designed for text augmentation using subword embeddings. This class is particularly useful when working with subword representations for Thai text augmentation.

+
+
+class pythainlp.augment.word2vec.bpemb_wv.BPEmbAug(lang: str = 'th', vs: int = 100000, dim: int = 300)[source]
+

Thai Text Augment using word2vec from BPEmb

+

BPEmb: +github.com/bheinzerling/bpemb

+
+
+__init__(lang: str = 'th', vs: int = 100000, dim: int = 300)[source]
+
+ +
+
+tokenizer(text: str) List[str][source]
+
+
Parameters:
+

text (str) – Thai text

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+load_w2v()[source]
+

Load BPEmb model

+
+ +
+
+augment(sentence: str, n_sent: int = 1, p: float = 0.7) List[Tuple[str]][source]
+

Text Augment using word2vec from BPEmb

+
+
Parameters:
+
    +
  • sentence (str) – Thai sentence

  • +
  • n_sent (int) – number of sentences

  • +
  • p (float) – probability of word

  • +
+
+
Returns:
+

list of synonyms

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug
+
+aug = BPEmbAug()
+aug.augment("ผมเรียน", n_sent=2, p=0.5)
+# output: ['ผมสอน', 'ผมเข้าเรียน']
+
+
+
+ +
+ +
+
+

Additional Functions

+

To further enhance your text augmentation tasks, the pythainlp.augment module offers the following functions:

+
    +
  • postype2wordnet: This function maps part-of-speech tags from a Thai POS tag corpus to WordNet-compatible POS tags, facilitating the integration of WordNet augmentation with Thai text (a usage sketch follows this list).

  • +
+
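A sketch of postype2wordnet usage. The import path (pythainlp.augment.wordnet) and the two-argument form (POS tag, corpus name) are assumptions drawn from the postag_corpus parameter documented above; check the module source if your version differs:

from pythainlp.augment.wordnet import postype2wordnet

# "NCMN" is an ORCHID common-noun tag; the result is the corresponding WordNet POS tag
print(postype2wordnet("NCMN", "orchid"))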

These functions and classes provide diverse techniques for text augmentation in the Thai language, making this module a valuable asset for NLP researchers, developers, and practitioners.

+

For detailed usage examples and guidelines, please refer to the official PyThaiNLP documentation. The pythainlp.augment module opens up new possibilities for enriching and diversifying Thai text data, leading to improved NLP models and applications.

+
+
+ + +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/.ipynb_checkpoints/transliterate-checkpoint.html b/5.1/api/.ipynb_checkpoints/transliterate-checkpoint.html new file mode 100644 index 0000000..269002d --- /dev/null +++ b/5.1/api/.ipynb_checkpoints/transliterate-checkpoint.html @@ -0,0 +1,538 @@ + + + + + + + + + pythainlp.transliterate — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.transliterate

+

The pythainlp.transliterate module is dedicated to the transliteration of Thai text into romanized form, effectively spelling it out with the English alphabet. This functionality is invaluable for making Thai text more accessible to non-Thai speakers and for various language processing tasks.

+
+

Modules

+
+
+pythainlp.transliterate.romanize(text: str, engine: str = 'royin', fallback_engine: str = 'royin') str[source]
+

This function renders Thai words in the Latin alphabet or “romanization”, +using the Royal Thai General System of Transcription (RTGS) +[1]. RTGS is the official system published +by the Royal Institute of Thailand. (Thai: ถอดเสียงภาษาไทยเป็นอักษรละติน)

+
+
Parameters:
+
    +
  • text (str) – Thai text to be romanized

  • +
  • engine (str) – One of ‘royin’ (default), ‘thai2rom’, ‘thai2rom_onnx, ‘tltk’, and ‘lookup’. See more in options for engine section.

  • +
  • fallback_engine (str) – If engine equals ‘lookup’, use fallback_engine for words that are not in the transliteration dict. +No effect on other engines. Default to ‘royin’.

  • +
+
+
Returns:
+

A string of Thai words rendered in the Latin alphabet.

+
+
Return type:
+

str

+
+
Options for engines:
+
    +
  • royin - (default) based on the Royal Thai General System of +Transcription issued by Royal Institute of Thailand.

  • +
  • thai2rom - a deep learning-based Thai romanization engine +(require PyTorch).

  • +
  • thai2rom_onnx - a deep learning-based Thai romanization engine with ONNX runtime

  • +
  • tltk - TLTK: Thai Language Toolkit

  • +
  • lookup - Look up on Thai-English Transliteration dictionary v1.4 compiled by Wannaphong.

  • +
+
+
Example:
+

+
+
from pythainlp.transliterate import romanize
+
+romanize("สามารถ", engine="royin")
+# output: 'samant'
+
+romanize("สามารถ", engine="thai2rom")
+# output: 'samat'
+
+romanize("สามารถ", engine="tltk")
+# output: 'samat'
+
+romanize("ภาพยนตร์", engine="royin")
+# output: 'phapn'
+
+romanize("ภาพยนตร์", engine="thai2rom")
+# output: 'phapphayon'
+
+romanize("ภาพยนตร์", engine="thai2rom_onnx")
+# output: 'phapphayon'
+
+romanize("ก็อปปี้", engine="lookup")
+# output: 'copy'
+
+
+

The romanize function allows you to transliterate Thai text, converting it into a phonetic representation using the English alphabet. It’s a fundamental tool for rendering Thai words and phrases in a more familiar format.

+
+ +
+
+pythainlp.transliterate.transliterate(text: str, engine: str = 'thaig2p') str[source]
+

This function transliterates Thai text.

+
+
Parameters:
+
    +
  • text (str) – Thai text to be transliterated

  • +
  • engine (str) – ‘icu’, ‘ipa’, or ‘thaig2p’ (default)

  • +
+
+
Returns:
+

A string of phonetic alphabets indicating +how the input text should be pronounced.

+
+
Return type:
+

str

+
+
Options for engines:
+
    +
  • thaig2p - (default) Thai Grapheme-to-Phoneme, +output is IPA (require PyTorch)

  • +
  • icu - pyicu, based on International Components for Unicode (ICU)

  • +
  • ipa - epitran, output is International Phonetic Alphabet (IPA)

  • +
  • tltk_g2p - Thai Grapheme-to-Phoneme from TLTK.,

  • +
  • iso_11940 - Thai text into Latin characters with ISO 11940.

  • +
  • tltk_ipa - tltk, output is International Phonetic Alphabet (IPA)

  • +
  • thaig2p_v2 - Thai Grapheme-to-Phoneme, +output is IPA. https://huggingface.co/pythainlp/thaig2p-v2.0

  • +
+
+
Example:
+

+
+
from pythainlp.transliterate import transliterate
+
+transliterate("สามารถ", engine="icu")
+# output: 's̄āmārt̄h'
+
+transliterate("สามารถ", engine="ipa")
+# output: 'saːmaːrot'
+
+transliterate("สามารถ", engine="thaig2p")
+# output: 's aː ˩˩˦ . m aː t̚ ˥˩'
+
+transliterate("สามารถ", engine="tltk_ipa")
+# output: 'saː5.maːt3'
+
+transliterate("สามารถ", engine="tltk_g2p")
+# output: 'saa4~maat2'
+
+transliterate("สามารถ", engine="iso_11940")
+# output: 's̄āmārt̄h'
+
+transliterate("ภาพยนตร์", engine="icu")
+# output: 'p̣hāphyntr̒'
+
+transliterate("ภาพยนตร์", engine="ipa")
+# output: 'pʰaːpjanot'
+
+transliterate("ภาพยนตร์", engine="thaig2p")
+# output: 'pʰ aː p̚ ˥˩ . pʰ a ˦˥ . j o n ˧'
+
+transliterate("ภาพยนตร์", engine="iso_11940")
+# output: 'p̣hāphyntr'
+
+
+

The transliterate function serves as a versatile transliteration tool, offering a range of transliteration engines to choose from. It provides flexibility and customization for your transliteration needs.

+
+ +
+
+pythainlp.transliterate.pronunciate(word: str, engine: str = 'w2p') str[source]
+

This function generates the pronunciation of a Thai word, written out in Thai letters.

+
+
Parameters:
+
    +
  • word (str) – Thai text to be pronunciated

  • +
  • engine (str) – ‘w2p’ (default)

  • +
+
+
Returns:
+

A string of Thai letters indicating +how the input text should be pronounced.

+
+
Return type:
+

str

+
+
Options for engines:
+
    +
  • w2p - Thai Word-to-Phoneme

  • +
+
+
Example:
+

+
+
from pythainlp.transliterate import pronunciate
+
+pronunciate("สามารถ", engine="w2p")
+# output: 'สา-มาด'
+
+pronunciate("ภาพยนตร์", engine="w2p")
+# output: 'พาบ-พะ-ยน'
+
+
+

This function provides assistance in generating phonetic representations of Thai words, which is particularly useful for language learning and pronunciation practice.

+
+ +
+
+pythainlp.transliterate.puan(word: str, show_pronunciation: bool = True) str[source]
+

Thai Spoonerism

+

This function converts Thai word to spoonerism word.

+
+
Parameters:
+
    +
  • word (str) – Thai word to be spoonerized

  • +
  • show_pronunciation (bool) – True (default) or False

  • +
+
+
Returns:
+

A string of Thai spoonerism word.

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.transliterate import puan
+
+puan("นาริน")
+# output: 'นิน-รา'
+
+puan("นาริน", False)
+# output: 'นินรา'
+
+
+

The puan function offers a unique transliteration feature known as “Puan.” It provides a specialized transliteration method for Thai text and is an additional option for rendering Thai text into English characters.

+
+ +
+
+class pythainlp.transliterate.wunsen.WunsenTransliterate[source]
+

Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text +to Thai text +by Wunsen

+
+
See Also:
+
+
+
+

The WunsenTransliterate class wraps the “Wunsen” transliteration engine. It converts romanized Japanese, Korean, Mandarin, and Vietnamese text into Thai script, complementing the Thai-to-Latin engines above.

+
+
+__init__() None[source]
+
+ +
+
+transliterate(text: str, lang: str, jp_input: str | None = None, zh_sandhi: bool | None = None, system: str | None = None)[source]
+

Use Wunsen for transliteration

+
+
Parameters:
+
    +
  • text (str) – text to be transliterated to Thai text.

  • +
  • lang (str) – source language

  • +
  • jp_input (str) – Japanese input method (for Japanese only)

  • +
  • zh_sandhi (bool) – Mandarin third tone sandhi option +(for Mandarin only)

  • +
  • system (str) – transliteration system (for Japanese and +Mandarin only)

  • +
+
+
Returns:
+

Thai text

+
+
Return type:
+

str

+
+
Options for lang:
+
    +
  • jp - Japanese (from Hepburn romanization)

  • +
  • ko - Korean (from Revised Romanization)

  • +
  • vi - Vietnamese (Latin script)

  • +
  • zh - Mandarin (from Hanyu Pinyin)

  • +
+
+
Options for jp_input:
+
    +
  • Hepburn-no diacritic - Hepburn-no diacritic (without macron)

  • +
+
+
Options for zh_sandhi:
+
    +
  • True - apply third tone sandhi rule

  • +
  • False - do not apply third tone sandhi rule

  • +
+
+
Options for system:
+
    +
  • +
    ORS61 - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น

    (สำนักงานราชบัณฑิตยสภา พ.ศ. 2561)

    +
    +
    +
  • +
  • +
    RI35 - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น

    (ราชบัณฑิตยสถาน พ.ศ. 2535)

    +
    +
    +
  • +
  • +
    RI49 - for Mandarin หลักเกณฑ์การทับศัพท์ภาษาจีน

    (ราชบัณฑิตยสถาน พ.ศ. 2549)

    +
    +
    +
  • +
  • +
    THC43 - for Mandarin เกณฑ์การถ่ายทอดเสียงภาษาจีนแมนดาริน

    ด้วยอักขรวิธีไทย (คณะกรรมการสืบค้นประวัติศาสตร์ไทยในเอกสาร +ภาษาจีน พ.ศ. 2543)

    +
    +
    +
  • +
+
+
Example:
+

+
+
from pythainlp.transliterate.wunsen import WunsenTransliterate
+
+wt = WunsenTransliterate()
+
+wt.transliterate("ohayō", lang="jp")
+# output: 'โอฮาโย'
+
+wt.transliterate(
+    "ohayou",
+    lang="jp",
+    jp_input="Hepburn-no diacritic"
+)
+# output: 'โอฮาโย'
+
+wt.transliterate("ohayō", lang="jp", system="RI35")
+# output: 'โอะฮะโย'
+
+wt.transliterate("annyeonghaseyo", lang="ko")
+# output: 'อันนย็องฮาเซโย'
+
+wt.transliterate("xin chào", lang="vi")
+# output: 'ซีน จ่าว'
+
+wt.transliterate("ni3 hao3", lang="zh")
+# output: 'หนี เห่า'
+
+wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False)
+# output: 'หนี่ เห่า'
+
+wt.transliterate("ni3 hao3", lang="zh", system="RI49")
+# output: 'หนี ห่าว'
+
+
+
+ +
+ +
+
+

Transliteration Engines

+

thai2rom

+
+
+pythainlp.transliterate.thai2rom.romanize(text: str) str[source]
+

Romanize Thai text

+
+
Parameters:
+

text (str) – Thai text to be romanized

+
+
Returns:
+

Roman characters representing the pronunciation of the Thai text

+
+
Return type:
+

str

+
+
+

The thai2rom engine specializes in transliterating Thai text into romanized form. It’s particularly useful for rendering Thai words accurately in an English phonetic format.
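For example, this is the engine behind romanize(..., engine="thai2rom") shown earlier on this page, and it can also be called directly:

from pythainlp.transliterate.thai2rom import romanize

romanize("สามารถ")
# output: 'samat'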

+
+ +

royin

+
+
+pythainlp.transliterate.royin.romanize(text: str) str[source]
+

Render Thai words in Latin alphabet, using RTGS

+

The Royal Thai General System of Transcription (RTGS) is the official romanization system published by the Royal Institute of Thailand.

+
+
Parameters:
+

text (str) – Thai text to be romanized

+
+
Returns:
+

A string of Thai words rendered in the Latin alphabet

+
+
Return type:
+

str

+
+
+

The royin engine renders Thai text in Latin characters following the official RTGS rules. It is a rule-based alternative to the neural engines; as the “สามารถ” and “ภาพยนตร์” examples near the top of this page show, its output can differ from the actual pronunciation.
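For example, selecting this engine through the top-level function (the same call and output as in the example near the top of this page):

from pythainlp.transliterate import romanize

romanize("สามารถ", engine="royin")
# output: 'samant'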

+
+ +

Transliterate Engines

+

This section includes multiple transliteration engines designed to suit various use cases. They offer unique methods for transliterating Thai text into romanized form:

+
    +
  • icu: Utilizes the ICU transliteration system for phonetic conversion.

  • +
  • ipa: Provides International Phonetic Alphabet (IPA) representation of Thai text.

  • +
  • thaig2p: (default) Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation.

  • +
  • thaig2p_v2: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation. This model is from https://huggingface.co/pythainlp/thaig2p-v2.0

  • +
  • tltk: Utilizes the TLTK transliteration system for a specific approach to transliteration.

  • +
  • iso_11940: Focuses on the ISO 11940 transliteration standard.

  • +
+
+
+

References

+ +

The pythainlp.transliterate module offers a comprehensive set of tools and engines for transliterating Thai text into Romanized form. Whether you need a simple transliteration, specific engines for accurate representation, or phonetic rendering, this module provides a wide range of options. Additionally, the module references a publication that highlights the significance of Romanization, Transliteration, and Transcription in making the Thai language accessible to a global audience.

+
+
+ + +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/.ipynb_checkpoints/word_vector-checkpoint.html b/5.1/api/.ipynb_checkpoints/word_vector-checkpoint.html new file mode 100644 index 0000000..4886b38 --- /dev/null +++ b/5.1/api/.ipynb_checkpoints/word_vector-checkpoint.html @@ -0,0 +1,487 @@ + + + + + + + + + pythainlp.word_vector — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.word_vector

+

The pythainlp.word_vector module contains functions that make use of pre-trained, publicly available word vector data. It is a valuable resource for working with pre-trained word vectors: these vectors are trained on large corpora and can be used for various natural language processing tasks, such as word similarity, document similarity, and more.

+
+

Dependencies

+

Installation of numpy and gensim is required.

+

Before using this module, you need to ensure that the numpy and gensim libraries are installed in your environment. These libraries are essential for loading and working with the pre-trained word vectors.

+
+
+

Modules

+
+
+class pythainlp.word_vector.WordVector(model_name: str = 'thai2fit_wv')[source]
+

Word Vector class

+
+
Parameters:
+

model_name (str) – model name

+
+
+
+
Options for model_name
    +
  • thai2fit_wv (default) - word vector from thai2fit

  • +
  • ltw2v - word vector from LTW2V: The Large Thai Word2Vec v0.1

  • +
  • ltw2v_v1.0_15_window - word vector from LTW2V v1.0 and 15 window

  • +
  • ltw2v_v1.0_5_window - word vector from LTW2V v1.0 and 5 window

  • +
+
+
+

The WordVector class encapsulates word vector operations and functions. It provides a convenient interface for loading models, finding word similarities, and generating sentence vectors.

+
+
+__init__(model_name: str = 'thai2fit_wv') None[source]
+

Word Vector class

+
+
Parameters:
+

model_name (str) – model name

+
+
+
+
Options for model_name
    +
  • thai2fit_wv (default) - word vector from thai2fit

  • +
  • ltw2v - word vector from LTW2V: The Large Thai Word2Vec

  • +
  • ltw2v_v1.0_15_window - word2vec from LTW2V 1.0 and 15 window

  • +
  • ltw2v_v1.0_5_window - word2vec from LTW2V v1.0 and 5 window

  • +
+
+
+
+ +
+
+load_wordvector(model_name: str)[source]
+

Load word vector model.

+
+
Parameters:
+

model_name (str) – model name

+
+
+
+ +
+
+get_model() KeyedVectors[source]
+

Get word vector model.

+
+
Returns:
+

gensim word2vec model

+
+
Return type:
+

gensim.models.keyedvectors.Word2VecKeyedVectors

+
+
+
+ +
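A short sketch combining load_wordvector() and get_model(). The "ltw2v" name comes from the model options listed above; the model data is downloaded on first use:

from pythainlp.word_vector import WordVector

wv = WordVector()            # defaults to the thai2fit_wv model
wv.load_wordvector("ltw2v")  # switch to the LTW2V word vectors
model = wv.get_model()       # gensim KeyedVectors object
print(model.vector_size)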
+
+doesnt_match(words: List[str]) str[source]
+

This function returns one word that is mostly unrelated to other words +in the list. We use the function doesnt_match() +from gensim.

+
+
Parameters:
+

words (list) – a list of words

+
+
Raises:
+

KeyError – if there is any word in positive or negative that is +not in the vocabulary of the model.

+
+
Returns:
+

the word is that mostly unrelated

+
+
Return type:
+

str

+
+
Note:
+
    +
  • If a word in words is not in the vocabulary, KeyError +will be raised.

  • +
+
+
Example:
+

+
+

Pick the word “พริกไทย” (name of food) out of the list of meals
(“อาหารเช้า”, “อาหารเที่ยง”, “อาหารเย็น”).

>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> words = ['อาหารเช้า', 'อาหารเที่ยง', 'อาหารเย็น', 'พริกไทย']
+>>> wv.doesnt_match(words)
+พริกไทย

+

Pick the word “เรือ” (name of vehicle) out of the list of words +related to occupation (“ดีไซน์เนอร์”, “พนักงานเงินเดือน”, “หมอ”).

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> words = ['ดีไซน์เนอร์', 'พนักงานเงินเดือน', 'หมอ', 'เรือ']
+>>> wv.doesnt_match(words)
+เรือ
+
+
+
+ +
+
+most_similar_cosmul(positive: List[str], negative: List[str]) List[Tuple[str, float]][source]
+

This function finds the top-10 words that are most similar with respect +to two lists of words labeled as positive and negative. +The top-10 most similar words are obtained using multiplication +combination objective from Omer Levy and Yoav Goldberg +[OmerLevy_YoavGoldberg_2014].

+

We use the function gensim.most_similar_cosmul() directly from +gensim.

+
+
Parameters:
+
    +
  • positive (list) – a list of words to add

  • +
  • negative (list) – a list of words to subtract

  • +
+
+
Raises:
+

KeyError – if there is any word in positive or negative that is +not in the vocabulary of the model.

+
+
Returns:
+

list of top-10 most similar words and its similarity score

+
+
Return type:
+

list[tuple[str, float]]

+
+
Note:
+
    +
  • With a single word in the positive list, it will find the +most similar words to the word given (similar +to gensim.most_similar())

  • +
  • If a word in positive or negative is not in the vocabulary, +KeyError will be raised.

  • +
+
+
Example:
+

+
+

Find the top-10 most similar words to the word: “แม่น้ำ”.

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> list_positive = ['แม่น้ำ']
+>>> list_negative = []
+>>> wv.most_similar_cosmul(list_positive, list_negative)
+[('ลำน้ำ', 0.8206598162651062), ('ทะเลสาบ', 0.775945782661438),
+('ลุ่มน้ำ', 0.7490593194961548), ('คลอง', 0.7471904754638672),
+('ปากแม่น้ำ', 0.7354257106781006), ('ฝั่งแม่น้ำ', 0.7120099067687988),
+('ทะเล', 0.7030453681945801), ('ริมแม่น้ำ', 0.7015200257301331),
+('แหล่งน้ำ', 0.6997432112693787), ('ภูเขา', 0.6960948705673218)]
+
+
+

Find the top-10 most similar words to the words: “นายก”, +“รัฐมนตรี”, and “ประเทศ”.

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> list_positive = ['นายก', 'รัฐมนตรี', 'ประเทศ']
+>>> list_negative = []
+>>> wv.most_similar_cosmul(list_positive, list_negative)
+[('รองนายกรัฐมนตรี', 0.2730445861816406),
+('เอกอัครราชทูต', 0.26500266790390015),
+('นายกรัฐมนตรี', 0.2649088203907013),
+('ผู้ว่าราชการจังหวัด', 0.25119125843048096),
+('ผู้ว่าการ', 0.2510434687137604), ('เลขาธิการ', 0.24824175238609314),
+('ผู้ว่า', 0.2453523576259613), ('ประธานกรรมการ', 0.24147476255893707),
+('รองประธาน', 0.24123257398605347), ('สมาชิกวุฒิสภา',
+0.2405330240726471)]
+
+
+

Find the top-10 most similar words when having only positive +list and both positive and negative lists.

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> list_positive = ['ประเทศ', 'ไทย', 'จีน', 'ญี่ปุ่น']
+>>> list_negative = []
+>>> wv.most_similar_cosmul(list_positive, list_negative)
+[('ประเทศจีน', 0.22022421658039093), ('เกาหลี', 0.2196873426437378),
+('สหรัฐอเมริกา', 0.21660110354423523),
+('ประเทศญี่ปุ่น', 0.21205860376358032),
+('ประเทศไทย', 0.21159221231937408), ('เกาหลีใต้',
+0.20321202278137207),
+('อังกฤษ', 0.19610872864723206), ('ฮ่องกง', 0.1928885132074356),
+('ฝรั่งเศส', 0.18383873999118805), ('พม่า', 0.18369348347187042)]
+>>>
+>>> list_positive = ['ประเทศ', 'ไทย', 'จีน', 'ญี่ปุ่น']
+>>> list_negative = ['อเมริกา']
+>>> wv.most_similar_cosmul(list_positive, list_negative)
+[('ประเทศไทย', 0.3278159201145172), ('เกาหลี', 0.3201899230480194),
+('ประเทศจีน', 0.31755179166793823), ('พม่า', 0.30845439434051514),
+('ประเทศญี่ปุ่น', 0.306713730096817),
+('เกาหลีใต้', 0.3003999888896942),
+('ลาว', 0.2995176911354065), ('คนไทย', 0.2885020673274994),
+('เวียดนาม', 0.2878379821777344), ('ชาวไทย', 0.28480708599090576)]
+
+
+

The function returns KeyError when the term “เมนูอาหารไทย” +is not in the vocabulary.

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> list_positive = ['เมนูอาหารไทย']
+>>> list_negative = []
+>>> wv.most_similar_cosmul(list_positive, list_negative)
+KeyError: "word 'เมนูอาหารไทย' not in vocabulary"
+
+
+
+ +
+
+similarity(word1: str, word2: str) float[source]
+

This function computes cosine similarity between two words.

+
+
Parameters:
+
    +
  • word1 (str) – first word to be compared with

  • +
  • word2 (str) – second word to be compared with

  • +
+
+
Raises:
+

KeyError – if either word1 or word2 is not in the +vocabulary of the model.

+
+
Returns:
+

the cosine similarity between the two word vectors

+
+
Return type:
+

float

+
+
Note:
+
    +
  • If a word in word1 or word2 is not in the vocabulary, +KeyError will be raised.

  • +
+
+
Example:
+

+
+

Compute cosine similarity between two words: “รถไฟ” and “รถไฟฟ้า” (train and electric train).

+
>>> from pythainlp.word_vector import WordVector
+>>> wv = WordVector()
+>>> wv.similarity('รถไฟ', 'รถไฟฟ้า')
+0.43387136
+
+
+

Compute cosine similarity between two words: “เสือดาว” and “รถไฟฟ้า” (leopard and electric train).

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> wv.similarity('เสือดาว', 'รถไฟฟ้า')
+0.04300258
+
+
+
+ +
+
+sentence_vectorizer(text: str, use_mean: bool = True) ndarray[source]
+

This function converts a Thai sentence into vector. +Specifically, it first tokenizes that text and map each tokenized word +with the word vectors from the model. +Then, word vectors are aggregated into one vector of 300 dimension +by calculating either mean or summation of all word vectors.

+
+
Parameters:
+
    +
  • text (str) – text input

  • +
  • use_mean (bool) – if True aggregate word vectors with mean of all +word vectors. Otherwise, aggregate with +summation of all word vectors

  • +
+
+
Returns:
+

300-dimension vector representing the given sentence +in form of numpy array

+
+
Return type:
+

numpy.ndarray((1,300))

+
+
Example:
+

+
+

Vectorize the sentence, “อ้วนเสี้ยวเข้ายึดแคว้นกิจิ๋ว ในปี พ.ศ. 735”, +into one sentence vector with two aggregation methods: mean +and summation.

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> sentence = 'อ้วนเสี้ยวเข้ายึดแคว้นกิจิ๋ว ในปี พ.ศ. 735'
+>>> wv.sentence_vectorizer(sentence, use_mean=True)
+array([[-0.00421414, -0.08881307,  0.05081136, -0.05632929,
+     -0.06607185, 0.03059357, -0.113882  , -0.00074836,  0.05035743,
+     0.02914307,
+     ...
+    0.02893357,  0.11327957,  0.04562086, -0.05015393,  0.11641257,
+    0.32304936, -0.05054322,  0.03639471, -0.06531371,  0.05048079]])
+>>>
+>>> wv.sentence_vectorizer(sentence, use_mean=False)
+array([[-0.05899798, -1.24338295,  0.711359  , -0.78861002,
+     -0.92500597, 0.42831   , -1.59434797, -0.01047703,  0.705004
+    ,  0.40800299,
+    ...
+    0.40506999,  1.58591403,  0.63869202, -0.702155  ,  1.62977601,
+    4.52269109, -0.70760502,  0.50952601, -0.914392  ,  0.70673105]])
+
+
+
+ +
+ +
+
+

References

+
    +
  • Omer Levy and Yoav Goldberg (2014). Linguistic Regularities in Sparse and Explicit Word Representations. https://www.aclweb.org/anthology/W14-1618/ This work discusses linguistic regularities in word representations and underlines the theoretical foundation of word vectors and their applications in NLP.

  • +
+

This enhanced documentation provides a more detailed and organized overview of the pythainlp.word_vector module, making it a valuable resource for NLP practitioners and researchers working with pre-trained word vectors in the Thai language.

+
+
+ + +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/ancient.html b/5.1/api/ancient.html new file mode 100644 index 0000000..92140ef --- /dev/null +++ b/5.1/api/ancient.html @@ -0,0 +1,188 @@ + + + + + + + + + pythainlp.ancient — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.ancient

+
+

Modules

+
+
+pythainlp.ancient.aksonhan_to_current(word: str) str[source]
+

Convert AksonHan words to current Thai words

+

AksonHan (อักษรหัน) is an old Thai writing convention in which a consonant is written twice to spell the /a/ vowel (สระ อะ).

+

Today, รร is an aksonHan word that is still used in Thai.

+
+
Parameters:
+

word (str) – Thai word

+
+
Returns:
+

the current Thai word converted from the AksonHan form

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.ancient import aksonhan_to_current
+
+print(aksonhan_to_current("จกก"))
+# output: จัก
+
+print(aksonhan_to_current("บงงคบบ"))
+# output: บังคับ
+
+print(aksonhan_to_current("สรรเพชญ")) # รร is still used.
+# output: สรรเพชญ
+
+
+
+ +
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/augment.html b/5.1/api/augment.html new file mode 100644 index 0000000..a76bcec --- /dev/null +++ b/5.1/api/augment.html @@ -0,0 +1,673 @@ + + + + + + + + + pythainlp.augment — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.augment

+
+

Introduction

+

The pythainlp.augment module is a powerful toolset for text augmentation in the Thai language. Text augmentation is a process that enriches and diversifies textual data by generating alternative versions of the original text. This module is a valuable resource for improving the quality and variety of Thai language data for NLP tasks.

+
+
+

WordNetAug Class

+

The WordNetAug class is designed to perform text augmentation using WordNet, a lexical database. This class enables you to augment Thai text with synonyms drawn from WordNet, offering a dictionary-based approach to text diversification. The following methods are available within this class:

+
+
+class pythainlp.augment.WordNetAug[source]
+

Text Augment using wordnet

+
+
+__init__()[source]
+
+ +
+
+find_synonyms(word: str, pos: str | None = None, postag_corpus: str = 'orchid') List[str][source]
+

Find synonyms using wordnet

+
+
Parameters:
+
    +
  • word (str) – word

  • +
  • pos (str) – part-of-speech type

  • +
  • postag_corpus (str) – name of POS tag corpus

  • +
+
+
Returns:
+

list of synonyms

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+augment(sentence: str, tokenize: object = <function word_tokenize>, max_syn_sent: int = 6, postag: bool = True, postag_corpus: str = 'orchid') List[List[str]][source]
+

Text Augment using wordnet

+
+
Parameters:
+
    +
  • sentence (str) – Thai sentence

  • +
  • tokenize (object) – function for tokenizing words

  • +
  • max_syn_sent (int) – maximum number of synonymous sentences

  • +
  • postag (bool) – use part-of-speech

  • +
  • postag_corpus (str) – name of POS tag corpus

  • +
+
+
Returns:
+

list of synonyms

+
+
Return type:
+

List[Tuple[str]]

+
+
Example:
+

+
+
from pythainlp.augment import WordNetAug
+
+aug = WordNetAug()
+aug.augment("เราชอบไปโรงเรียน")
+# output: [('เรา', 'ชอบ', 'ไป', 'ร.ร.'),
+ ('เรา', 'ชอบ', 'ไป', 'รร.'),
+ ('เรา', 'ชอบ', 'ไป', 'โรงเรียน'),
+ ('เรา', 'ชอบ', 'ไป', 'อาคารเรียน'),
+ ('เรา', 'ชอบ', 'ไปยัง', 'ร.ร.'),
+ ('เรา', 'ชอบ', 'ไปยัง', 'รร.')]
+
+
+
+ +
+ +
+
+

Word2VecAug, Thai2fitAug, LTW2VAug Classes

+

The pythainlp.augment.word2vec package contains multiple classes for text augmentation using Word2Vec models. These classes include Word2VecAug, Thai2fitAug, and LTW2VAug. Each of these classes allows you to use Word2Vec embeddings to generate text variations. Explore the methods provided by these classes to understand their capabilities.

+
+
+class pythainlp.augment.word2vec.Word2VecAug(model: str, tokenize: object, type: str = 'file')[source]
+
+
+__init__(model: str, tokenize: object, type: str = 'file') None[source]
+
+
Parameters:
+
    +
  • model (str) – path of model

  • +
  • tokenize (object) – tokenize function

  • +
  • type (str) – model type (file, binary)

  • +
+
+
+
+ +
+
+modify_sent(sent: str, p: float = 0.7) List[List[str]][source]
+
+
Parameters:
+
    +
  • sent (str) – text of sentence

  • +
  • p (float) – probability

  • +
+
+
Return type:
+

List[List[str]]

+
+
+
+ +
+
+augment(sentence: str, n_sent: int = 1, p: float = 0.7) List[Tuple[str]][source]
+
+
Parameters:
+
    +
  • sentence (str) – text of sentence

  • +
  • n_sent (int) – maximum number of synonymous sentences

  • +
  • p (float) – probability

  • +
+
+
Returns:
+

list of synonyms

+
+
Return type:
+

List[Tuple[str]]

+
+
+
+ +
+ +
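A minimal usage sketch for Word2VecAug. The model path "my_word2vec.bin" and the use of pythainlp.tokenize.word_tokenize are illustrative assumptions; point the class at your own word2vec file and tokenizer:

from pythainlp.augment.word2vec import Word2VecAug
from pythainlp.tokenize import word_tokenize

# "my_word2vec.bin" is a placeholder path to a local word2vec model file
aug = Word2VecAug("my_word2vec.bin", tokenize=word_tokenize, type="binary")
aug.augment("ผมเรียน", n_sent=2, p=0.5)
# returns a list of augmented sentences, each as a tuple of tokens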
+
+class pythainlp.augment.word2vec.Thai2fitAug[source]
+

Text Augment using word2vec from Thai2Fit

+

Thai2Fit: +github.com/cstorm125/thai2fit

+
+
+__init__()[source]
+
+ +
+
+tokenizer(text: str) List[str][source]
+
+
Parameters:
+

text (str) – Thai text

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+load_w2v()[source]
+

Load Thai2Fit’s word2vec model

+
+ +
+
+augment(sentence: str, n_sent: int = 1, p: float = 0.7) List[Tuple[str]][source]
+

Text Augment using word2vec from Thai2Fit

+
+
Parameters:
+
    +
  • sentence (str) – Thai sentence

  • +
  • n_sent (int) – number of sentences

  • +
  • p (float) – probability of word

  • +
+
+
Returns:
+

list of text augmented

+
+
Return type:
+

List[Tuple[str]]

+
+
Example:
+

+
+
from pythainlp.augment.word2vec import Thai2fitAug
+
+aug = Thai2fitAug()
+aug.augment("ผมเรียน", n_sent=2, p=0.5)
+# output: [('พวกเรา', 'เรียน'), ('ฉัน', 'เรียน')]
+
+
+
+ +
+ +
+
+class pythainlp.augment.word2vec.LTW2VAug[source]
+

Text Augment using word2vec from LTW2V

+

LTW2V: +github.com/PyThaiNLP/large-thaiword2vec

+
+
+__init__()[source]
+
+ +
+
+tokenizer(text: str) List[str][source]
+
+
Parameters:
+

text (str) – Thai text

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+load_w2v()[source]
+

Load LTW2V’s word2vec model

+
+ +
+
+augment(sentence: str, n_sent: int = 1, p: float = 0.7) List[Tuple[str]][source]
+

Text Augment using word2vec from LTW2V

+
+
Parameters:
+
    +
  • sentence (str) – Thai sentence

  • +
  • n_sent (int) – number of sentences

  • +
  • p (float) – probability of word

  • +
+
+
Returns:
+

list of text augmented

+
+
Return type:
+

List[Tuple[str]]

+
+
Example:
+

+
+
from pythainlp.augment.word2vec import LTW2VAug
+
+aug = LTW2VAug()
+aug.augment("ผมเรียน", n_sent=2, p=0.5)
+# output: [('เขา', 'เรียนหนังสือ'), ('เขา', 'สมัครเรียน')]
+
+
+
+ +
+ +
+
+

FastTextAug and Thai2transformersAug Classes

+

The pythainlp.augment.lm package offers classes for text augmentation using language models. These classes include FastTextAug and Thai2transformersAug. These classes allow you to use language model-based techniques to diversify text data. Explore their methods to understand their capabilities.

+
+
+class pythainlp.augment.lm.FastTextAug(model_path: str)[source]
+

Text Augment from fastText

+
+
Parameters:
+

model_path (str) – path of model file

+
+
+
+
+__init__(model_path: str)[source]
+
+
Parameters:
+

model_path (str) – path of model file

+
+
+
+ +
+
+tokenize(text: str) List[str][source]
+

Thai text tokenization for fastText

+
+
Parameters:
+

text (str) – Thai text

+
+
Returns:
+

list of words

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+modify_sent(sent: str, p: float = 0.7) List[List[str]][source]
+
+
Parameters:
+
    +
  • sent (str) – text of sentence

  • +
  • p (float) – probability

  • +
+
+
Return type:
+

List[List[str]]

+
+
+
+ +
+
+augment(sentence: str, n_sent: int = 1, p: float = 0.7) List[Tuple[str]][source]
+

Text Augment from fastText

+

You may want to download the Thai model +from https://fasttext.cc/docs/en/crawl-vectors.html.

+
+
Parameters:
+
    +
  • sentence (str) – Thai sentence

  • +
  • n_sent (int) – number of sentences

  • +
  • p (float) – probability of word

  • +
+
+
Returns:
+

list of synonyms

+
+
Return type:
+

List[Tuple[str]]

+
+
+
+ +
+ +
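A usage sketch for FastTextAug. The filename "cc.th.300.bin" is an assumption referring to the Thai fastText vectors mentioned in the augment() note below; download the model separately and adjust the path:

from pythainlp.augment.lm import FastTextAug

# placeholder path to a Thai fastText model downloaded from fasttext.cc
aug = FastTextAug("cc.th.300.bin")
aug.augment("ผมเรียน", n_sent=2, p=0.5)
# returns a list of augmented sentences, each as a tuple of tokens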
+
+class pythainlp.augment.lm.Thai2transformersAug[source]
+
+
+__init__()[source]
+
+ +
+
+generate(sentence: str, num_replace_tokens: int = 3)[source]
+
+ +
+
+augment(sentence: str, num_replace_tokens: int = 3) List[str][source]
+

Text augmentation from WangchanBERTa

+
+
Parameters:
+
    +
  • sentence (str) – Thai sentence

  • +
  • num_replace_tokens (int) – number of tokens to replace

  • +
+
+
Returns:
+

list of text augment

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.augment.lm import Thai2transformersAug
+
+aug = Thai2transformersAug()
+
+aug.augment("ช้างมีทั้งหมด 50 ตัว บน")
+# output: ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้',
+ 'ช้างมีทั้งหมด 50 ตัว บนสุด',
+ 'ช้างมีทั้งหมด 50 ตัว บนบก',
+ 'ช้างมีทั้งหมด 50 ตัว บนนั้น',
+ 'ช้างมีทั้งหมด 50 ตัว บนหัว']
+
+
+
+ +
+ +
+
+

BPEmbAug Class

+

The pythainlp.augment.word2vec.bpemb_wv package contains the BPEmbAug class, which is designed for text augmentation using subword embeddings. This class is particularly useful when working with subword representations for Thai text augmentation.

+
+
+class pythainlp.augment.word2vec.bpemb_wv.BPEmbAug(lang: str = 'th', vs: int = 100000, dim: int = 300)[source]
+

Thai Text Augment using word2vec from BPEmb

+

BPEmb: +github.com/bheinzerling/bpemb

+
+
+__init__(lang: str = 'th', vs: int = 100000, dim: int = 300)[source]
+
+ +
+
+tokenizer(text: str) List[str][source]
+
+
Parameters:
+

text (str) – Thai text

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+load_w2v()[source]
+

Load BPEmb model

+
+ +
+
+augment(sentence: str, n_sent: int = 1, p: float = 0.7) List[Tuple[str]][source]
+

Text Augment using word2vec from BPEmb

+
+
Parameters:
+
    +
  • sentence (str) – Thai sentence

  • +
  • n_sent (int) – number of sentences

  • +
  • p (float) – probability of word

  • +
+
+
Returns:
+

list of synonyms

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug
+
+aug = BPEmbAug()
+aug.augment("ผมเรียน", n_sent=2, p=0.5)
+# output: ['ผมสอน', 'ผมเข้าเรียน']
+
+
+
+ +
+ +
+
+

Additional Functions

+

To further enhance your text augmentation tasks, the pythainlp.augment module offers the following functions:

+
    +
  • postype2wordnet: This function maps part-of-speech tags from a Thai POS tag corpus to WordNet-compatible POS tags, facilitating the integration of WordNet augmentation with Thai text (a usage sketch follows this list).

  • +
+
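A sketch of postype2wordnet usage. The import path (pythainlp.augment.wordnet) and the two-argument form (POS tag, corpus name) are assumptions drawn from the postag_corpus parameter documented above; check the module source if your version differs:

from pythainlp.augment.wordnet import postype2wordnet

# "NCMN" is an ORCHID common-noun tag; the result is the corresponding WordNet POS tag
print(postype2wordnet("NCMN", "orchid"))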

These functions and classes provide diverse techniques for text augmentation in the Thai language, making this module a valuable asset for NLP researchers, developers, and practitioners.

+

For detailed usage examples and guidelines, please refer to the official PyThaiNLP documentation. The pythainlp.augment module opens up new possibilities for enriching and diversifying Thai text data, leading to improved NLP models and applications.

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/benchmarks.html b/5.1/api/benchmarks.html new file mode 100644 index 0000000..cc68b69 --- /dev/null +++ b/5.1/api/benchmarks.html @@ -0,0 +1,263 @@ + + + + + + + + + pythainlp.benchmarks — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.benchmarks

+
+

Introduction

+

The pythainlp.benchmarks module is a collection of utility functions designed for benchmarking tasks related to Thai Natural Language Processing (NLP). Currently, the module includes tools for word tokenization benchmarking. Please note that additional benchmarking tasks will be incorporated in the future.

+
+
+

Tokenization

+

Word tokenization is a fundamental task in NLP, and it plays a crucial role in various applications, such as text analysis and language processing. The pythainlp.benchmarks module offers a set of functions to assist in the benchmarking and evaluation of word tokenization methods.

+
+

Quality Evaluation

+

The quality of word tokenization can significantly impact the accuracy of downstream NLP tasks. To assess the quality of word tokenization, the module provides a qualitative evaluation using various metrics and techniques.

+
+../_images/evaluation.png + +
+

Qualitative evaluation of word tokenization.

+
+
+
+
+
+

Functions

+
+
+pythainlp.benchmarks.word_tokenization.compute_stats(ref_sample: str, raw_sample: str) dict[source]
+

Compute statistics for tokenization quality

+

These statistics include:

+
+
Character-Level:

True Positive, False Positive, True Negative, False Negative, Precision, Recall, and f1

+
+
Word-Level:

Precision, Recall, and f1

+
+
Other:
    +
  • Correct tokenization indicator: {0, 1} sequence indicating that the corresponding +word is tokenized correctly.

  • +
+
+
+
+
Parameters:
+
    +
  • ref_sample (str) – ground truth for samples

  • +
  • raw_sample (str) – tokenized sample to be evaluated

  • +
+
+
Returns:
+

metrics at character- and word-level and indicators of correctly tokenized words

+
+
Return type:
+

dict[str, float | str]

+
+
+

This function is used to compute various statistics and metrics related to word tokenization. It allows you to assess the performance of different tokenization methods.
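A minimal sketch, assuming token boundaries in both strings are marked with the "|" character (the convention used by the PyThaiNLP tokenization benchmark data):

from pythainlp.benchmarks import word_tokenization

ref = "ฉัน|รัก|ภาษา|ไทย"     # ground-truth tokenization
sample = "ฉัน|รักภาษา|ไทย"   # tokenization to evaluate
stats = word_tokenization.compute_stats(
    word_tokenization.preprocessing(ref),
    word_tokenization.preprocessing(sample),
)
print(stats)  # inspect the returned dict for character- and word-level metrics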

+
+ +
+
+pythainlp.benchmarks.word_tokenization.benchmark(ref_samples: List[str], samples: List[str]) DataFrame[source]
+

Performance benchmarking for samples.

+

Please see pythainlp.benchmarks.word_tokenization.compute_stats() for +the computed metrics.

+
+
Parameters:
+
    +
  • ref_samples (list[str]) – ground truth for samples

  • +
  • samples (list[str]) – samples that we want to evaluate

  • +
+
+
Returns:
+

dataframe with row x col = len(samples) x len(metrics)

+
+
Return type:
+

pandas.DataFrame

+
+
+

The benchmark function facilitates the benchmarking of word tokenization methods. It provides an organized framework for evaluating and comparing the effectiveness of different tokenization tools.
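A sketch of benchmarking a list of samples, using the same "|" boundary convention as in the compute_stats sketch above:

from pythainlp.benchmarks import word_tokenization

ref_samples = ["ฉัน|รัก|ภาษา|ไทย"]
samples = ["ฉัน|รักภาษา|ไทย"]

df = word_tokenization.benchmark(ref_samples, samples)
print(df.columns)  # one row per sample, one column per computed metric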

+
+ +
+
+pythainlp.benchmarks.word_tokenization.preprocessing(txt: str, remove_space: bool = True) str[source]
+

Clean up text before performing evaluation.

+
+
Parameters:
+
    +
  • text (str) – text to be preprocessed

  • +
  • remove_space (bool) – whether to remove white space

  • +
+
+
Returns:
+

preprocessed text

+
+
Return type:
+

str

+
+
+

Preprocessing is a crucial step in NLP tasks. The preprocessing function assists in preparing text data for tokenization, which is essential for accurate and consistent benchmarking.
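For example:

from pythainlp.benchmarks import word_tokenization

word_tokenization.preprocessing("ฉัน รัก ภาษาไทย")
# returns the cleaned text; with remove_space=True (the default), whitespace is removed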

+
+ +
+
+

Usage

+

To make use of these benchmarking functions, you can follow the provided examples and guidelines in the official PyThaiNLP documentation. These tools are invaluable for researchers, developers, and anyone interested in improving and evaluating Thai word tokenization methods.

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/chat.html b/5.1/api/chat.html new file mode 100644 index 0000000..06cd40e --- /dev/null +++ b/5.1/api/chat.html @@ -0,0 +1,223 @@ + + + + + + + + + pythainlp.chat — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.chat

+
+
+class pythainlp.chat.ChatBotModel[source]
+
+
+__init__()[source]
+

Chat using AI generation

+
+ +
+
+reset_chat()[source]
+

Reset chat by cleaning history

+
+ +
+
+load_model(model_name: str = 'wangchanglm', return_dict: bool = True, load_in_8bit: bool = False, device: str = 'cuda', torch_dtype=torch.float16, offload_folder: str = './', low_cpu_mem_usage: bool = True)[source]
+

Load model

+
+
Parameters:
+
    +
  • model_name (str) – Model name (Now, we support wangchanglm only)

  • +
  • return_dict (bool) – return_dict

  • +
  • load_in_8bit (bool) – load model in 8bit

  • +
  • device (str) – device (cpu, cuda or other)

  • +
  • torch_dtype (torch_dtype) – torch_dtype

  • +
  • offload_folder (str) – offload folder

  • +
  • low_cpu_mem_usage (bool) – low cpu mem usage

  • +
+
+
+
+ +
+
+chat(text: str) str[source]
+

Chatbot

+
+
Parameters:
+

text (str) – text for asking chatbot with.

+
+
Returns:
+

answer from chatbot.

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.chat import ChatBotModel
+import torch
+
+chatbot = ChatBotModel()
+chatbot.load_model(device="cpu",torch_dtype=torch.bfloat16)
+
+print(chatbot.chat("สวัสดี"))
+# output: ยินดีที่ได้รู้จัก
+
+print(chatbot.history)
+# output: [('สวัสดี', 'ยินดีที่ได้รู้จัก')]
+
+
+
+ +
+ +
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/classify.html b/5.1/api/classify.html new file mode 100644 index 0000000..21ba430 --- /dev/null +++ b/5.1/api/classify.html @@ -0,0 +1,236 @@ + + + + + + + + + pythainlp.classify — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.classify

+
+
+class pythainlp.classify.GzipModel(training_data: List[Tuple[str, str]] | None = None, model_path: str | None = None)[source]
+

This class is a re-implementation of +“Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors +(Jiang et al., Findings 2023)

+
+
Parameters:
+
    +
  • training_data (list) – list [(text_sample,label)]

  • +
  • model_path (str) – Path for loading model (if you saved the model)

  • +
+
+
+
+
+__init__(training_data: List[Tuple[str, str]] | None = None, model_path: str | None = None)[source]
+
+ +
+
+train()[source]
+
+ +
+
+predict(x1: str, k: int = 1) str[source]
+
+
Parameters:
+
    +
  • x1 (str) – the text that we want to predict label for.

  • +
  • k (str) – k

  • +
+
+
Returns:
+

label

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.classify import GzipModel
+
+training_data =  [
+    ("รายละเอียดตามนี้เลยค่าา ^^", "Neutral"),
+    ("กลัวพวกมึงหาย อดกินบาบิก้อน", "Neutral"),
+    ("บริการแย่มากก เป็นหมอได้ไง😤", "Negative"),
+    ("ขับรถแย่มาก", "Negative"),
+    ("ดีนะครับ", "Positive"),
+    ("ลองแล้วรสนี้อร่อย... ชอบๆ", "Positive"),
+    ("ฉันรู้สึกโกรธ เวลามือถือแบตหมด", "Negative"),
+    ("เธอภูมิใจที่ได้ทำสิ่งดี ๆ และดีใจกับเด็ก ๆ", "Positive"),
+    ("นี่เป็นบทความหนึ่ง", "Neutral")
+]
+model = GzipModel(training_data)
+print(model.predict("ฉันดีใจ", k=1))
+# output: Positive
+
+
+
+ +
+
+save(path: str)[source]
+
+
Parameters:
+

path (str) – path for save model

+
+
+
+ +
+
+load(path: str)[source]
+
+ +
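A sketch of persisting and reloading a trained model. The filename "gzip_model" is a placeholder; reloading goes through the model_path argument of the constructor, as documented above:

from pythainlp.classify import GzipModel

training_data = [("ดีนะครับ", "Positive"), ("ขับรถแย่มาก", "Negative")]
model = GzipModel(training_data)
model.save("gzip_model")           # placeholder path

restored = GzipModel(model_path="gzip_model")
print(restored.predict("ฉันดีใจ", k=1))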
+ +
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/coref.html b/5.1/api/coref.html new file mode 100644 index 0000000..c3b2f63 --- /dev/null +++ b/5.1/api/coref.html @@ -0,0 +1,223 @@ + + + + + + + + + pythainlp.coref — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.coref

+
+

Introduction

+

The pythainlp.coref module is dedicated to Coreference Resolution for the Thai language. Coreference resolution is a crucial task in natural language processing (NLP) that deals with identifying and linking expressions (such as pronouns) in a text to the entities or concepts they refer to. This module provides tools to tackle coreference resolution challenges in the context of the Thai language.

+
+
+

Coreference Resolution Function

+

The primary component of the pythainlp.coref module is the coreference_resolution function. This function is designed to analyze text and identify instances of coreference, helping NLP systems understand when different expressions in the text refer to the same entity. Here’s how you can use it:

+

The pythainlp.coref is Coreference Resolution for Thai.

+
+
+pythainlp.coref.coreference_resolution(texts: List[str], model_name: str = 'han-coref-v1.0', device: str = 'cpu')[source]
+

Coreference Resolution

+
+
Parameters:
+
    +
  • texts (List[str]) – list of texts to apply coreference resolution to

  • +
  • model_name (str) – coreference resolution model

  • +
  • device (str) – device for running coreference resolution model on (“cpu”, “cuda”, and others)

  • +
+
+
Returns:
+

List of texts with coreference resolution

+
+
Return type:
+

List[dict]

+
+
Options for model_name:
+
    +
  • han-coref-v1.0 - (default) Han-Coref: Thai coreference resolution by PyThaiNLP v1.0

  • +
+
+
Example:
+

+
+
from pythainlp.coref import coreference_resolution
+
+print(
+    coreference_resolution(
+        ["Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก"]
+    )
+)
+# output:
+# [
+# {'text': 'Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก',
+# 'clusters_string': [['Bill Gates', 'ผม']],
+# 'clusters': [[(0, 10), (50, 52)]]}
+# ]
+
+
+
+ +
+
+

Usage

+

To use the coreference_resolution function effectively, follow these steps:

+
    +
  1. Import the coreference_resolution function from the pythainlp.coref module.

  2. +
  3. Pass a list of Thai texts you want to analyze for coreferences as input to the function.

  4. +
  5. The function will process the text and return information about coreference relationships within the text.

  6. +
+

Example:

+
+

from pythainlp.coref import coreference_resolution

+

text = “นาย A มาจาก กรุงเทพ และเขา มีความรักต่อ บางกิจ ของเขา” +coreferences = coreference_resolution(text)

+

print(coreferences)

+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/corpus.html b/5.1/api/corpus.html new file mode 100644 index 0000000..aadf646 --- /dev/null +++ b/5.1/api/corpus.html @@ -0,0 +1,1695 @@ + + + + + + + + + pythainlp.corpus — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.corpus

+

The pythainlp.corpus module provides access to various Thai language corpora and resources that come bundled with PyThaiNLP. These resources are essential for natural language processing tasks in the Thai language.

+
+

Modules

+
+

countries

+
+
+pythainlp.corpus.countries() FrozenSet[str][source]
+
+

Return a frozenset of country names in Thai such as “แคนาดา”, “โรมาเนีย”, +“แอลจีเรีย”, and “ลาว”.

+
+

(See: dev/pythainlp/corpus/countries_th.txt)

+
+
+
return:
+

frozenset containing country names in Thai

+
+
rtype:
+

frozenset

+
+
+
+
+ +
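For example:

from pythainlp.corpus import countries

thai_country_names = countries()
print("ลาว" in thai_country_names)
# output: True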
+
+

find_synonym

+
+
+

get_corpus

+
+
+pythainlp.corpus.get_corpus(filename: str, comments: bool = True) frozenset[source]
+

Read corpus data from file and return a frozenset.

+

Each line in the file will be a member of the set.

+

Whitespace stripped and empty values and duplicates removed.

+

If comments is False, any text at any position after the character +‘#’ in each line will be discarded.

+
+
Parameters:
+
    +
  • filename (str) – filename of the corpus to be read

  • +
  • comments (bool) – keep comments

  • +
+
+
Returns:
+

frozenset consisting of lines in the file

+
+
Return type:
+

frozenset

+
+
Example:
+

+
+
from pythainlp.corpus import get_corpus
+
+# input file (negations_th.txt):
+# แต่
+# ไม่
+
+get_corpus("negations_th.txt")
+# output:
+# frozenset({'แต่', 'ไม่'})
+
+# input file (ttc_freq.txt):
+# ตัวบท<tab>10
+# โดยนัยนี้<tab>1
+
+get_corpus("ttc_freq.txt")
+# output:
+# frozenset({'โดยนัยนี้\t1',
+#    'ตัวบท\t10',
+#     ...})
+
+# input file (icubrk_th.txt):
+# # Thai Dictionary for ICU BreakIterator
+# กก
+# กกขนาก
+
+get_corpus("icubrk_th.txt")
+# output:
+# frozenset({'กกขนาก',
+#     '# Thai Dictionary for ICU BreakIterator',
+#     'กก',
+#     ...})
+
+get_corpus("icubrk_th.txt", comments=False)
+# output:
+# frozenset({'กกขนาก',
+#     'กก',
+#     ...})
+
+
+
+ +
+
+

get_corpus_as_is

+
+
+pythainlp.corpus.get_corpus_as_is(filename: str) list[source]
+

Read corpus data from file, as it is, and return a list.

+

Each line in the file will be a member of the list.

+

No modifications in member values and their orders.

+

If strip or comment removal is needed, use get_corpus() instead.

+
+
Parameters:
+

filename (str) – filename of the corpus to be read

+
+
Returns:
+

list consisting of lines in the file

+
+
Return type:
+

list

+
+
Example:
+

+
+
from pythainlp.corpus import get_corpus
+
+# input file (negations_th.txt):
+# แต่
+# ไม่
+
+get_corpus_as_is("negations_th.txt")
+# output:
+# ['แต่', 'ไม่']
+
+
+
+ +
+
+

get_corpus_db

+
+
+pythainlp.corpus.get_corpus_db(url: str)[source]
+

Get corpus catalog from server.

+
+
Parameters:
+

url (str) – URL corpus catalog

+
+
+
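A minimal usage sketch (assuming network access; the URL below is the default catalog referenced elsewhere on this page, and the exact return type is not documented here, so the result is simply printed):

from pythainlp.corpus import get_corpus_db

# fetch the corpus catalog from the default PyThaiNLP catalog URL
catalog = get_corpus_db("https://pythainlp.org/pythainlp-corpus/db.json")
print(catalog)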
+ +
+
+

get_corpus_db_detail

+
+
+pythainlp.corpus.get_corpus_db_detail(name: str, version: str = '') dict[source]
+

Get details about a corpus, using information from local catalog.

+
+
Parameters:
+

name (str) – name of corpus

+
+
Returns:
+

details about corpus

+
+
Return type:
+

dict

+
+
+
+ +
+
+

get_corpus_default_db

+
+
+pythainlp.corpus.get_corpus_default_db(name: str, version: str = '') str | None[source]
+

Get model path from default_db.json

+
+
Parameters:
+

name (str) – corpus name

+
+
Returns:
+

path to the corpus or None if the corpus doesn’t exist on the device

+
+
Return type:
+

str

+
+
+

To change the defaults, edit pythainlp/corpus/default_db.json in the source tree.

+
+ +
+
+

get_corpus_path

+
+
+pythainlp.corpus.get_corpus_path(name: str, version: str = '', force: bool = False) str | None[source]
+

Get corpus path.

+
+
Parameters:
+
    +
  • name (str) – corpus name

  • +
  • version (str) – version

  • +
  • force (bool) – force downloading

  • +
+
+
Returns:
+

path to the corpus or None if the corpus doesn’t exist on the device

+
+
Return type:
+

str

+
+
Example:
+

+
+

(Please see the corpus file names listed in this file.)

+

If the corpus already exists:

+
from pythainlp.corpus import get_corpus_path
+
+print(get_corpus_path('ttc'))
+# output: /root/pythainlp-data/ttc_freq.txt
+
+
+

If the corpus has not been downloaded yet:

+
from pythainlp.corpus import download, get_corpus_path
+
+print(get_corpus_path('wiki_lm_lstm'))
+# output: None
+
+download('wiki_lm_lstm')
+# output:
+# Download: wiki_lm_lstm
+# wiki_lm_lstm 0.32
+# thwiki_lm.pth?dl=1: 1.05GB [00:25, 41.5MB/s]
+# /root/pythainlp-data/thwiki_model_lstm.pth
+
+print(get_corpus_path('wiki_lm_lstm'))
+# output: /root/pythainlp-data/thwiki_model_lstm.pth
+
+
+
+ +
+
+

download

+
+
+pythainlp.corpus.download(name: str, force: bool = False, url: str = '', version: str = '') bool[source]
+

Download corpus.

+

The available corpus names can be seen in this file: +https://pythainlp.org/pythainlp-corpus/db.json

+
+
Parameters:
+
    +
  • name (str) – corpus name

  • +
  • force (bool) – force downloading

  • +
  • url (str) – URL of the corpus catalog

  • +
  • version (str) – version of the corpus

  • +
+
+
Returns:
+

True if the corpus is found and successfully downloaded. +Otherwise, it returns False.

+
+
Return type:
+

bool

+
+
Example:
+

+
+
from pythainlp.corpus import download
+
+download("wiki_lm_lstm", force=True)
+# output:
+# Corpus: wiki_lm_lstm
+# - Downloading: wiki_lm_lstm 0.1
+# thwiki_lm.pth:  26%|██▌       | 114k/434k [00:00<00:00, 690kB/s]
+
+
+

By default, downloaded corpora and models will be saved in +$HOME/pythainlp-data/ +(e.g. /Users/bact/pythainlp-data/wiki_lm_lstm.pth).

+
+ +
+
+

remove

+
+
+pythainlp.corpus.remove(name: str) bool[source]
+

Remove corpus

+
+
Parameters:
+

name (str) – corpus name

+
+
Returns:
+

True if the corpus is found and successfully removed. +Otherwise, it returns False.

+
+
Return type:
+

bool

+
+
Example:
+

+
+
from pythainlp.corpus import remove, get_corpus_path, get_corpus
+
+print(remove("ttc"))
+# output: True
+
+print(get_corpus_path("ttc"))
+# output: None
+
+get_corpus("ttc")
+# output:
+# FileNotFoundError: [Errno 2] No such file or directory:
+# '/usr/local/lib/python3.6/dist-packages/pythainlp/corpus/ttc'
+
+
+
+ +
+
+

provinces

+
+
+pythainlp.corpus.provinces(details: bool = False) FrozenSet[str] | List[dict][source]
+
+

Return a frozenset of Thailand province names in Thai such as “กระบี่”, +“กรุงเทพมหานคร”, “กาญจนบุรี”, and “อุบลราชธานี”.

+
+

(See: dev/pythainlp/corpus/thailand_provinces_th.txt)

+
+
+
param bool details:
+

return details of provinces or not

+
+
return:
+

frozenset containing province names of Thailand (if details is False) or list containing dict of province names and details such as [{‘name_th’: ‘นนทบุรี’, ‘abbr_th’: ‘นบ’, ‘name_en’: ‘Nonthaburi’, ‘abbr_en’: ‘NBI’}].

+
+
rtype:
+

frozenset or list

+
+
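A short sketch of both modes (the membership test relies on the bundled province list; the first element shown for details=True depends on file order):

from pythainlp.corpus import provinces

print("นนทบุรี" in provinces())
# output: True

# with details=True, each province is a dict such as
# {'name_th': 'นนทบุรี', 'abbr_th': 'นบ', 'name_en': 'Nonthaburi', 'abbr_en': 'NBI'}
print(provinces(details=True)[0])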
+
+
+ +
+
+

thai_dict

+
+
+pythainlp.corpus.thai_dict() dict[source]
+
+

Return Thai dictionary with definition from wiktionary.

+
+

(See: thai_dict)

+
+
+
return:
+

Thai words with part-of-speech type and definition

+
+
rtype:
+

dict

+
+
+
+
+ +
+
+

thai_stopwords

+
+
+pythainlp.corpus.thai_stopwords() FrozenSet[str][source]
+
+

Return a frozenset of Thai stopwords such as “มี”, “ไป”, “ไง”, “ขณะ”, +“การ”, and “ประการหนึ่ง”.

+
+
+
(See: dev/pythainlp/corpus/stopwords_th.txt)

The stopword list is taken from the thesis of เพ็ญศิริ ลี้ตระกูล.

+
+
See Also:
+

+
+

เพ็ญศิริ ลี้ตระกูล . การเลือกประโยคสำคัญในการสรุปความภาษาไทยโดยใช้แบบจำลองแบบลำดับชั้น. กรุงเทพมหานคร : มหาวิทยาลัยธรรมศาสตร์; 2551.

+
+
return:
+

frozenset containing stopwords.

+
+
rtype:
+

frozenset

+
+
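A common use is filtering stopwords out of a tokenized sentence; a minimal sketch (the exact tokens depend on the default tokenizer engine):

from pythainlp.corpus import thai_stopwords
from pythainlp.tokenize import word_tokenize

stopwords = thai_stopwords()
tokens = word_tokenize("การเดินทางครั้งนี้มีความหมาย")
# keep only tokens that are not in the stopword list
content_tokens = [t for t in tokens if t not in stopwords]
print(content_tokens)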
+
+
+
+ +
+
+

thai_words

+
+
+pythainlp.corpus.thai_words() FrozenSet[str][source]
+
+

Return a frozenset of Thai words such as “กติกา”, “กดดัน”, “พิษ”, +and “พิษภัย”.

+
+

(See: dev/pythainlp/corpus/words_th.txt)

+
+
+
return:
+

frozenset containing words in the Thai language.

+
+
rtype:
+

frozenset

+
+
+
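A minimal membership check against the bundled word list:

from pythainlp.corpus import thai_words

words = thai_words()
print("กติกา" in words)
# output: True
print("พิษภัย" in words)
# output: True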
+
+ +
+
+

thai_wsd_dict

+
+
+pythainlp.corpus.thai_wsd_dict() dict[source]
+
+

Return Thai Word Sense Disambiguation dictionary with definition from wiktionary.

+
+

(See: thai_dict)

+
+
+
return:
+

Thai words with part-of-speech type and definition

+
+
rtype:
+

dict

+
+
+
+
+ +
+
+

thai_orst_words

+
+
+pythainlp.corpus.thai_orst_words() FrozenSet[str][source]
+
+

Return a frozenset of Thai words from Royal Society of Thailand

+
+

(See: dev/pythainlp/corpus/thai_orst_words.txt)

+
+
+
return:
+

frozenset containing words in the Thai language.

+
+
rtype:
+

frozenset

+
+
+
+
+ +
+
+

thai_synonyms

+
+
+pythainlp.corpus.thai_synonyms() dict[source]
+
+

Return Thai synonyms.

+
+

(See: thai_synonym)

+
+
+
return:
+

Thai words with part-of-speech type and synonym

+
+
rtype:
+

dict

+
+
+
+
+ +
+
+

thai_syllables

+
+
+pythainlp.corpus.thai_syllables() FrozenSet[str][source]
+
+

Return a frozenset of Thai syllables such as “กรอบ”, “ก็”, “๑”, “โมบ”, +“โมน”, “โม่ง”, “กา”, “ก่า”, and, “ก้า”.

+
+
+
(See: dev/pythainlp/corpus/syllables_th.txt)

We use the Thai syllable list from KUCut.

+
+
return:
+

frozenset containing syllables in the Thai language.

+
+
rtype:
+

frozenset

+
+
+
+
+
+ +
+
+

thai_negations

+
+
+pythainlp.corpus.thai_negations() FrozenSet[str][source]
+
+

Return a frozenset of Thai negation words including “ไม่” and “แต่”.

+
+

(See: dev/pythainlp/corpus/negations_th.txt)

+
+
+
return:
+

frozenset containing negations in the Thai language.

+
+
rtype:
+

frozenset

+
+
+
+
+ +
+
+

thai_family_names

+
+
+pythainlp.corpus.thai_family_names() FrozenSet[str][source]
+
+

Return a frozenset of Thai family names

+
+

(See: dev/pythainlp/corpus/family_names_th.txt)

+
+
+
return:
+

frozenset containing Thai family names.

+
+
rtype:
+

frozenset

+
+
+
+
+ +
+
+

thai_female_names

+
+
+pythainlp.corpus.thai_female_names() FrozenSet[str][source]
+
+

Return a frozenset of Thai female names

+
+

(See: dev/pythainlp/corpus/person_names_female_th.txt)

+
+
+
return:
+

frozenset containing Thai female names.

+
+
rtype:
+

frozenset

+
+
+
+
+ +
+
+

thai_male_names

+
+
+pythainlp.corpus.thai_male_names() FrozenSet[str][source]
+
+

Return a frozenset of Thai male names

+
+

(See: dev/pythainlp/corpus/person_names_male_th.txt)

+
+
+
return:
+

frozenset containing Thai male names.

+
+
rtype:
+

frozenset

+
+
+
+
+ +
+
+

pythainlp.corpus.th_en_translit.get_transliteration_dict

+
+
+pythainlp.corpus.th_en_translit.get_transliteration_dict() defaultdict[source]
+

Get Thai to English transliteration dictionary.

+

The returned dict is in dict[str, dict[List[str], List[Optional[bool]]]] format.

+
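A hedged sketch of inspecting the returned mapping (only the outer dict[str, ...] shape described above is assumed; one arbitrary entry is printed):

from pythainlp.corpus.th_en_translit import get_transliteration_dict

translit = get_transliteration_dict()
word = next(iter(translit))  # pick an arbitrary Thai key
print(word, translit[word])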
+ +
+
+
+

ConceptNet

+

ConceptNet is an open, multilingual knowledge graph used for various natural language understanding tasks. For more information, refer to the ConceptNet documentation.

+
+

pythainlp.corpus.conceptnet.edges

+
+
+pythainlp.corpus.conceptnet.edges(word: str, lang: str = 'th')[source]
+

Get edges from ConceptNet API. +ConceptNet is a public semantic network, designed to help computers +understand the meanings of words that people use.

+

For example, the term “ConceptNet” is a “knowledge graph”, and a “knowledge graph” has “common sense knowledge”, which is a part of “artificial intelligence”. Also, “ConceptNet” is used for “natural language understanding”, which is a part of “artificial intelligence”.

+
+
+
“ConceptNet” –is a–> “knowledge graph” –has–> “common sense” –a part of–> “artificial intelligence”
+
“ConceptNet” –used for–> “natural language understanding” –a part of–> “artificial intelligence”
+
+
+

With this illustration, it shows relationships (represented as Edge) +between the terms (represented as Node).

+

This function requires an internet connection to access the ConceptNet API. +Please use it considerately. It will timeout after 10 seconds.

+
+
Parameters:
+
    +
  • word (str) – word to be sent to ConceptNet API

  • +
  • lang (str) – abbreviation of language (e.g. th for Thai, en for English, or ja for Japanese). By default, it is th (Thai).

  • +
+
+
Returns:
+

return edges of the given word according to the +ConceptNet network.

+
+
Return type:
+

list[dict]

+
+
Example:
+

+
+
from pythainlp.corpus.conceptnet import edges
+
+edges('hello', lang='en')
+# output:
+# [{
+#   '@id': '/a/[/r/IsA/,/c/en/hello/,/c/en/greeting/]',
+#   '@type': 'Edge',
+#   'dataset': '/d/conceptnet/4/en',
+#   'end': {'@id': '/c/en/greeting',
+#   '@type': 'Node',
+#   'label': 'greeting',
+#   'language': 'en',
+#   'term': '/c/en/greeting'},
+#   'license': 'cc:by/4.0',
+#   'rel': {'@id': '/r/IsA', '@type': 'Relation', 'label': 'IsA'},
+#   'sources': [
+#   {
+#   '@id': '/and/[/s/activity/omcs/vote/,/s/contributor/omcs/bmsacr/]',
+#   '@type': 'Source',
+#   'activity': '/s/activity/omcs/vote',
+#   'contributor': '/s/contributor/omcs/bmsacr'
+#   },
+#   {
+#     '@id': '/and/[/s/activity/omcs/vote/,/s/contributor/omcs/test/]',
+#     '@type': 'Source',
+#     'activity': '/s/activity/omcs/vote',
+#     'contributor': '/s/contributor/omcs/test'}
+#   ],
+#   'start': {'@id': '/c/en/hello',
+#   '@type': 'Node',
+#   'label': 'Hello',
+#   'language': 'en',
+#   'term': '/c/en/hello'},
+#   'surfaceText': '[[Hello]] is a kind of [[greeting]]',
+#   'weight': 3.4641016151377544
+# }, ...]
+
+edges('สวัสดี', lang='th')
+# output:
+# [{
+#  '@id': '/a/[/r/RelatedTo/,/c/th/สวัสดี/n/,/c/en/prosperity/]',
+#  '@type': 'Edge',
+#  'dataset': '/d/wiktionary/en',
+#  'end': {'@id': '/c/en/prosperity',
+#  '@type': 'Node',
+#  'label': 'prosperity',
+#  'language': 'en',
+#  'term': '/c/en/prosperity'},
+#  'license': 'cc:by-sa/4.0',
+#  'rel': {
+#      '@id': '/r/RelatedTo', '@type': 'Relation',
+#      'label': 'RelatedTo'},
+#  'sources': [{
+#  '@id': '/and/[/s/process/wikiparsec/2/,/s/resource/wiktionary/en/]',
+#  '@type': 'Source',
+#  'contributor': '/s/resource/wiktionary/en',
+#  'process': '/s/process/wikiparsec/2'}],
+#  'start': {'@id': '/c/th/สวัสดี/n',
+#  '@type': 'Node',
+#  'label': 'สวัสดี',
+#  'language': 'th',
+#  'sense_label': 'n',
+#  'term': '/c/th/สวัสดี'},
+#  'surfaceText': None,
+#  'weight': 1.0
+# }, ...]
+
+
+
+ +

TNC (Thai National Corpus)

+

The Thai National Corpus (TNC) is a collection of text data in the Thai language. This module provides access to word frequency data from the TNC corpus.

+
+
+

pythainlp.corpus.tnc.word_freqs

+
+
+pythainlp.corpus.tnc.word_freqs() List[Tuple[str, int]][source]
+
+

Get word frequency from Thai National Corpus (TNC)

+
+

(See: dev/pythainlp/corpus/tnc_freq.txt)

+
+
+
+ +
+
+

pythainlp.corpus.tnc.unigram_word_freqs

+
+
+pythainlp.corpus.tnc.unigram_word_freqs() dict[str, int][source]
+

Get unigram word frequency from Thai National Corpus (TNC)

+
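A minimal sketch listing the most frequent words in the TNC counts (the actual words and values come from the bundled tnc_freq data):

from pythainlp.corpus.tnc import unigram_word_freqs

freqs = unigram_word_freqs()
# sort by count, highest first, and show the top 5
top5 = sorted(freqs.items(), key=lambda kv: kv[1], reverse=True)[:5]
print(top5)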
+ +
+
+

pythainlp.corpus.tnc.bigram_word_freqs

+
+
+pythainlp.corpus.tnc.bigram_word_freqs() dict[Tuple[str, str], int][source]
+

Get bigram word frequency from Thai National Corpus (TNC)

+
+ +
+
+

pythainlp.corpus.tnc.trigram_word_freqs

+
+
+pythainlp.corpus.tnc.trigram_word_freqs() dict[Tuple[str, str, str], int][source]
+

Get trigram word frequency from Thai National Corpus (TNC)

+
+ +

TTC (Thai Textbook Corpus)

+

The Thai Textbook Corpus (TTC) is a collection of Thai language text data, primarily sourced from textbooks.

+
+
+

pythainlp.corpus.ttc.word_freqs

+
+
+pythainlp.corpus.ttc.word_freqs() List[Tuple[str, int]][source]
+
+

Get word frequency from Thai Textbook Corpus (TTC)

+
+

(See: dev/pythainlp/corpus/ttc_freq.txt)

+
+ +
+
+

pythainlp.corpus.ttc.unigram_word_freqs

+
+
+pythainlp.corpus.ttc.unigram_word_freqs() dict[str, int][source]
+

Get unigram word frequency from Thai Textbook Corpus (TTC)

+
+ +
+
+
+

OSCAR

+

OSCAR is a multilingual corpus that includes Thai text data. This module provides access to word frequency data from the OSCAR corpus.

+
+

pythainlp.corpus.oscar.word_freqs

+
+
+pythainlp.corpus.oscar.word_freqs() List[Tuple[str, int]][source]
+

Get word frequency from OSCAR Corpus (words tokenized using ICU)

+
+ +
+
+

pythainlp.corpus.oscar.unigram_word_freqs

+
+
+pythainlp.corpus.oscar.unigram_word_freqs() dict[str, int][source]
+

Get unigram word frequency from OSCAR Corpus (words tokenized using ICU)

+
+ +
+
+
+

Util

+

Utilities for working with the corpus data.

+
+

pythainlp.corpus.util.find_badwords

+
+
+pythainlp.corpus.util.find_badwords(tokenize: Callable[[str], List[str]], training_data: Iterable[Iterable[str]]) Set[str][source]
+

Find words that do not work well with the tokenize function +for the provided training_data.

+
+
Parameters:
+
    +
  • tokenize (Callable[[str], List[str]]) – a tokenize function

  • +
  • training_data (Iterable[Iterable[str]]) – tokenized text, to be used as a training set

  • +
+
+
Returns:
+

words that are considered to make tokenize perform badly

+
+
Return type:
+

Set[str]

+
+
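A minimal sketch with toy gold-standard data (in practice training_data should be a large manually tokenized corpus; word_tokenize is used only as an example of a str -> List[str] callable):

from pythainlp.corpus.util import find_badwords
from pythainlp.tokenize import word_tokenize

# toy gold-standard tokenization; real use needs a large annotated corpus
training_data = [
    ["ฉัน", "ไป", "โรงเรียน"],
    ["แมว", "กิน", "ปลา"],
]

bad = find_badwords(word_tokenize, training_data)
print(bad)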
+
+ +
+
+

pythainlp.corpus.util.revise_wordset

+
+
+pythainlp.corpus.util.revise_wordset(tokenize: Callable[[str], List[str]], orig_words: Iterable[str], training_data: Iterable[Iterable[str]]) Set[str][source]
+

Revise a set of words that could improve tokenization performance of +a dictionary-based tokenize function.

+

orig_words will be used as a base set for the dictionary. Words that do not perform well with training_data will be removed. The remaining words will be returned.

+
+
Parameters:
+
    +
  • tokenize (Callable[[str], List[str]]) – a tokenize function, can be any function that takes a string as input and returns a List[str]

  • +
  • orig_words (Iterable[str]) – words that used by the tokenize function, will be used as a base for revision

  • +
  • training_data (Iterable[Iterable[str]]) – tokenized text, to be used as a training set

  • +
+
+
Returns:
+

the revised set of words, with words that made tokenize perform badly removed

+
+
Return type:
+

Set[str]

+
+
Example::
+

+
+
from pythainlp.corpus import thai_words
+from pythainlp.corpus.util import revise_wordset
+from pythainlp.tokenize.longest import segment
+from pythainlp.util import Trie
+
+base_words = thai_words()
+more_words = {
+    "ถวิล อุดล", "ทองอินทร์ ภูริพัฒน์", "เตียง ศิริขันธ์", "จำลอง ดาวเรือง"
+}
+base_words = base_words.union(more_words)
+dict_trie = Trie(base_words)
+
+tokenize = lambda text: segment(text, dict_trie)
+
+# training_data is gold-standard tokenized text (a list of token lists)
+training_data = [
+    [str, str, str, ...],
+    [str, str, str, str, ...],
+    ...
+]
+
+revised_words = revise_wordset(tokenize, base_words, training_data)
+
+
+
+ +
+
+

pythainlp.corpus.util.revise_newmm_default_wordset

+
+
+pythainlp.corpus.util.revise_newmm_default_wordset(training_data: Iterable[Iterable[str]]) Set[str][source]
+

Revise a set of word that could improve tokenization performance of +pythainlp.tokenize.newmm, a dictionary-based tokenizer and a default +tokenizer for PyThaiNLP.

+

Words from pythainlp.corpus.thai_words() will be used as a base set for the dictionary. Words that do not perform well with training_data will be removed. The remaining words will be returned.

+
+
Parameters:
+

training_data (Iterable[Iterable[str]]) – tokenized text, to be used as a training set

+
+
Returns:
+

the revised set of words, with words that made the tokenizer perform badly removed

+
+
Return type:
+

Set[str]

+
+
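A minimal sketch with toy data (real revisions require a substantial gold-standard corpus; this only illustrates the call):

from pythainlp.corpus.util import revise_newmm_default_wordset

# toy gold-standard tokenization
training_data = [
    ["ฉัน", "ไป", "โรงเรียน"],
    ["แมว", "กิน", "ปลา"],
]

revised = revise_newmm_default_wordset(training_data)
print(len(revised))  # size of the revised dictionary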
+
+ +
+
+
+

WordNet

+

PyThaiNLP API includes the WordNet module, which is an exact copy of NLTK’s WordNet API for the Thai language. WordNet is a lexical database for English and other languages.

+

For more details on WordNet, refer to the NLTK WordNet documentation.

+
+

pythainlp.corpus.wordnet.synsets

+
+
+pythainlp.corpus.wordnet.synsets(word: str, pos: str | None = None, lang: str = 'tha')[source]
+

This function returns the synonym set for all lemmas of the given word +with an optional argument to constrain the part of speech of the word.

+
+
Parameters:
+
    +
  • word (str) – word to find synsets of

  • +
  • pos (str) – constraint of the part of speech (i.e. n for Noun, v +for Verb, a for Adjective, s for Adjective +satellites, and r for Adverb)

  • +
  • lang (str) – abbreviation of language (i.e. eng, tha). +By default, it is tha

  • +
+
+
Returns:
+

Synset all lemmas of the word constrained with +the argument pos.

+
+
Return type:
+

list[Synset]

+
+
Example:
+
>>> from pythainlp.corpus.wordnet import synsets
+>>>
+>>> synsets("ทำงาน")
+[Synset('function.v.01'), Synset('work.v.02'),
+ Synset('work.v.01'), Synset('work.v.08')]
+>>>
+>>> synsets("บ้าน", lang="tha"))
+[Synset('duplex_house.n.01'), Synset('dwelling.n.01'),
+ Synset('house.n.01'), Synset('family.n.01'), Synset('home.n.03'),
+ Synset('base.n.14'), Synset('home.n.01'),
+ Synset('houseful.n.01'), Synset('home.n.07')]
+
+
+

A part-of-speech constraint can also be specified. For example, the word “แรง” could be interpreted as force (n.) or hard (adj.).

+
>>> from pythainlp.corpus.wordnet import synsets
+>>> # By default, allow all parts of speech
+>>> synsets("แรง", lang="tha")
+>>>
+>>> # only Noun
+>>> synsets("แรง", pos="n", lang="tha")
+[Synset('force.n.03'), Synset('force.n.02')]
+>>>
+>>> # only Adjective
+>>> synsets("แรง", pos="a", lang="tha")
+[Synset('hard.s.10'), Synset('strong.s.02')]
+
+
+
+
+
+ +
+
+

pythainlp.corpus.wordnet.synset

+
+
+pythainlp.corpus.wordnet.synset(name_synsets)[source]
+

This function returns the synonym set (synset) given the name of the synset +(i.e. ‘dog.n.01’, ‘chase.v.01’).

+
+
Parameters:
+

name_synsets (str) – name of the synset

+
+
Returns:
+

Synset of the given name

+
+
Return type:
+

Synset

+
+
Example:
+
>>> from pythainlp.corpus.wordnet import synset
+>>>
+>>> difficult = synset('difficult.a.01')
+>>> difficult
+Synset('difficult.a.01')
+>>>
+>>> difficult.definition()
+'not easy; requiring great physical or mental effort to accomplish
+           or comprehend or endure'
+
+
+
+
+
+ +
+
+

pythainlp.corpus.wordnet.all_lemma_names

+
+
+pythainlp.corpus.wordnet.all_lemma_names(pos: str | None = None, lang: str = 'tha')[source]
+

This function returns all lemma names for all synsets of the given +part of speech tag and language. If part of speech tag is not +specified, all synsets of all parts of speech will be used.

+
+
Parameters:
+
    +
  • pos (str) – constraint of the part of speech (i.e. n for Noun, +v for Verb, a for Adjective, s for +Adjective satellites, and r for Adverb). +By default, pos is None.

  • +
  • lang (str) – abbreviation of language (i.e. eng, tha). +By default, it is tha.

  • +
+
+
Returns:
+

Synset of lemmas names given the POS and language

+
+
Return type:
+

list[Synset]

+
+
Example:
+
>>> from pythainlp.corpus.wordnet import all_lemma_names
+>>>
+>>> all_lemma_names()
+['อเมริโก_เวสปุชชี',
+ 'เมืองชีย์เอนเน',
+ 'การรับเลี้ยงบุตรบุญธรรม',
+ 'ผู้กัด',
+ 'ตกแต่งเรือด้วยธง',
+ 'จิโอวานนิ_เวอร์จินิโอ',...]
+>>>
+>>> len(all_lemma_names())
+80508
+>>>
+>>> all_lemma_names(pos="a")
+['ซึ่งไม่มีแอลกอฮอล์',
+ 'ซึ่งตรงไปตรงมา',
+ 'ที่เส้นศูนย์สูตร',
+ 'ทางจิตใจ',...]
+>>>
+>>> len(all_lemma_names(pos="a"))
+5277
+
+
+
+
+
+ +
+
+

pythainlp.corpus.wordnet.all_synsets

+
+
+pythainlp.corpus.wordnet.all_synsets(pos: str | None = None)[source]
+

This function iterates over all synsets constrained by the given +part of speech tag.

+
+
Parameters:
+

pos (str) – part of speech tag

+
+
Returns:
+

list of synsets constrained by the given part of speech tag.

+
+
Return type:
+

Iterable[Synset]

+
+
Example:
+
>>> from pythainlp.corpus.wordnet import all_synsets
+>>>
+>>> generator = all_synsets(pos="n")
+>>> next(generator)
+Synset('entity.n.01')
+>>> next(generator)
+Synset('physical_entity.n.01')
+>>> next(generator)
+Synset('abstraction.n.06')
+>>>
+>>>  generator = all_synsets()
+>>> next(generator)
+Synset('able.a.01')
+>>> next(generator)
+Synset('unable.a.01')
+
+
+
+
+
+ +
+
+

pythainlp.corpus.wordnet.langs

+
+
+pythainlp.corpus.wordnet.langs()[source]
+

This function returns a set of ISO-639 language codes.

+
+
Returns:
+

ISO-639 language codes

+
+
Return type:
+

list[str]

+
+
Example:
+
>>> from pythainlp.corpus.wordnet import langs
+>>> langs()
+['eng', 'als', 'arb', 'bul', 'cat', 'cmn', 'dan',
+ 'ell', 'eus', 'fas', 'fin', 'fra', 'glg', 'heb',
+ 'hrv', 'ind', 'ita', 'jpn', 'nld', 'nno', 'nob',
+ 'pol', 'por', 'qcn', 'slv', 'spa', 'swe', 'tha',
+ 'zsm']
+
+
+
+
+
+ +
+
+

pythainlp.corpus.wordnet.lemmas

+
+
+pythainlp.corpus.wordnet.lemmas(word: str, pos: str | None = None, lang: str = 'tha')[source]
+

This function returns all lemmas given the word with an optional +argument to constrain the part of speech of the word.

+
+
Parameters:
+
    +
  • word (str) – word to find lemmas of

  • +
  • pos (str) – constraint of the part of speech (i.e. n for Noun, +v for Verb, a for Adjective, s for +Adjective satellites, and r for Adverb)

  • +
  • lang (str) – abbreviation of language (i.e. eng, tha). +By default, it is tha.

  • +
+
+
Returns:
+

Synset of all lemmas of the word constrained +by the argument pos.

+
+
Return type:
+

list[Lemma]

+
+
Example:
+
>>> from pythainlp.corpus.wordnet import lemmas
+>>>
+>>> lemmas("โปรด")
+[Lemma('like.v.03.โปรด'), Lemma('like.v.02.โปรด')]
+
+
+
>>> print(lemmas("พระเจ้า"))
+[Lemma('god.n.01.พระเจ้า'), Lemma('godhead.n.01.พระเจ้า'),
+ Lemma('father.n.06.พระเจ้า'), Lemma('god.n.03.พระเจ้า')]
+
+
+

When the part of speech tag is specified:

+
>>> from pythainlp.corpus.wordnet import lemmas
+>>>
+>>> lemmas("ม้วน")
+[Lemma('roll.v.18.ม้วน'), Lemma('roll.v.17.ม้วน'),
+ Lemma('roll.v.08.ม้วน'),  Lemma('curl.v.01.ม้วน'),
+ Lemma('roll_up.v.01.ม้วน'), Lemma('wind.v.03.ม้วน'),
+ Lemma('roll.n.11.ม้วน')]
+>>>
+>>> # only lemmas with Noun as the part of speech
+>>> lemmas("ม้วน", pos="n")
+[Lemma('roll.n.11.ม้วน')]
+
+
+
+
+
+ +
+
+

pythainlp.corpus.wordnet.lemma

+
+
+pythainlp.corpus.wordnet.lemma(name_synsets)[source]
+

This function returns a lemma object given its name.

+
+

Note

+

Support only English language (eng).

+
+
+
Parameters:
+

name_synsets (str) – name of the synset

+
+
Returns:
+

lemma object with the given name

+
+
Return type:
+

Lemma

+
+
Example:
+
>>> from pythainlp.corpus.wordnet import lemma
+>>>
+>>> lemma('practice.v.01.exercise')
+Lemma('practice.v.01.exercise')
+>>>
+>>> lemma('drill.v.03.exercise')
+Lemma('drill.v.03.exercise')
+>>>
+>>> lemma('exercise.n.01.exercise')
+Lemma('exercise.n.01.exercise')
+
+
+
+
+
+ +
+
+

pythainlp.corpus.wordnet.lemma_from_key

+
+
+pythainlp.corpus.wordnet.lemma_from_key(key)[source]
+

This function returns a lemma object given the lemma key. It is similar to lemma(), but takes the key of the lemma instead of its name.

+
+

Note

+

Support only English language (eng).

+
+
+
Parameters:
+

key (str) – key of the lemma object

+
+
Returns:
+

lemma object with the given key

+
+
Return type:
+

Lemma

+
+
Example:
+
>>> from pythainlp.corpus.wordnet import lemma, lemma_from_key
+>>>
+>>> practice = lemma('practice.v.01.exercise')
+>>> practice.key()
+exercise%2:41:00::
+>>> lemma_from_key(practice.key())
+Lemma('practice.v.01.exercise')
+
+
+
+
+
+ +
+
+

pythainlp.corpus.wordnet.path_similarity

+
+
+pythainlp.corpus.wordnet.path_similarity(synsets1, synsets2)[source]
+

This function returns similarity between two synsets based on the +shortest path distance calculated using the equation below.

+
+\[path\_similarity = {1 \over shortest\_path\_distance(synsets1, synsets2) + 1}\]
+

The shortest path distance is calculated through the is-a (hypernym/hyponym) taxonomy. The score is in the range 0 to 1, where a path similarity of 1 indicates that the two synsets are identical.

+
+
Parameters:
+
    +
  • synsets1 (Synset) – first synset used to measure the path similarity

  • +
  • synsets2 (Synset) – second synset used to measure the path similarity

  • +
+
+
Returns:
+

path similarity between two synsets

+
+
Return type:
+

float

+
+
Example:
+
>>> from pythainlp.corpus.wordnet import path_similarity, synset
+>>>
+>>> entity = synset('entity.n.01')
+>>> obj = synset('object.n.01')
+>>> cat = synset('cat.n.01')
+>>>
+>>> path_similarity(entity, obj)
+0.3333333333333333
+>>> path_similarity(entity, cat)
+0.07142857142857142
+>>> path_similarity(obj, cat)
+0.08333333333333333
+
+
+
+
+
+ +
+
+

pythainlp.corpus.wordnet.lch_similarity

+
+
+pythainlp.corpus.wordnet.lch_similarity(synsets1, synsets2)[source]
+

This function returns Leacock Chodorow similarity (LCH) +between two synsets, based on the shortest path distance +and the maximum depth of the taxonomy. The equation to +calculate LCH similarity is shown below:

+
+\[lch\_similarity = -\log\left({shortest\_path\_distance(synsets1, synsets2) \over 2 \times taxonomy\_depth}\right)\]
+
+
Parameters:
+
    +
  • synsets1 (Synset) – first synset used to measure the LCH similarity

  • +
  • synsets2 (Synset) – second synset used to measure the LCH similarity

  • +
+
+
Returns:
+

LCH similarity between two synsets

+
+
Return type:
+

float

+
+
Example:
+
>>> from pythainlp.corpus.wordnet import lch_similarity, synset
+>>>
+>>> entity = synset('entity.n.01')
+>>> obj = synset('object.n.01')
+>>> cat = synset('cat.n.01')
+>>>
+>>> lch_similarity(entity, obj)
+2.538973871058276
+>>> lch_similarity(entity, cat)
+0.9985288301111273
+>>> lch_similarity(obj, cat)
+1.1526795099383855
+
+
+
+
+
+ +
+
+

pythainlp.corpus.wordnet.wup_similarity

+
+
+pythainlp.corpus.wordnet.wup_similarity(synsets1, synsets2)[source]
+

This function returns Wu-Palmer similarity (WUP) between two synsets, +based on the depth of the two senses in the taxonomy and their +Least Common Subsumer (most specific ancestor node).

+
+
Parameters:
+
    +
  • synsets1 (Synset) – first synset used to measure the WUP similarity

  • +
  • synsets2 (Synset) – second synset used to measure the WUP similarity

  • +
+
+
Returns:
+

WUP similarity between two synsets

+
+
Return type:
+

float

+
+
Example:
+
>>> from pythainlp.corpus.wordnet import wup_similarity, synset
+>>>
+>>> entity = synset('entity.n.01')
+>>> obj = synset('object.n.01')
+>>> cat = synset('cat.n.01')
+>>>
+>>> wup_similarity(entity, obj)
+0.5
+>>> wup_similarity(entity, cat)
+0.13333333333333333
+>>> wup_similarity(obj, cat)
+0.35294117647058826
+
+
+
+
+
+ +
+
+

pythainlp.corpus.wordnet.morphy

+
+
+pythainlp.corpus.wordnet.morphy(form, pos: str | None = None)[source]
+

This function finds a possible base form for the given form, +with the given part of speech.

+
+
Parameters:
+
    +
  • form (str) – the form to find the base form of

  • +
  • pos (str) – part of speech tag of words to be searched

  • +
+
+
Returns:
+

base form of the given form

+
+
Return type:
+

str

+
+
Example:
+
>>> from pythainlp.corpus.wordnet import morphy
+>>>
+>>> morphy("dogs")
+'dogs'
+>>>
+>>> morphy("thieves")
+'thief'
+>>>
+>>> morphy("mixed")
+'mix'
+>>>
+>>> morphy("calculated")
+'calculate'
+
+
+
+
+
+ +
+
+

pythainlp.corpus.wordnet.custom_lemmas

+
+
+pythainlp.corpus.wordnet.custom_lemmas(tab_file, lang: str)[source]
+

This function reads a custom tab file +(see: http://compling.hss.ntu.edu.sg/omw/) +containing mappings of lemmas in the given language.

+
+
Parameters:
+
    +
  • tab_file – Tab file as a file or file-like object

  • +
  • lang (str) – abbreviation of language (i.e. eng, tha).

  • +
+
+
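A hedged sketch: my_lemmas_tha.tab is a hypothetical file in the OMW tab format linked above; once loaded, the added lemmas become visible to the usual query functions such as synsets():

from pythainlp.corpus.wordnet import custom_lemmas, synsets

# "my_lemmas_tha.tab" is a hypothetical OMW-style tab file of Thai lemmas
with open("my_lemmas_tha.tab", encoding="utf-8") as tab_file:
    custom_lemmas(tab_file, lang="tha")

print(synsets("คำศัพท์ใหม่", lang="tha"))  # hypothetical lemma added above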
+
+ +
+

Definition

+
+
+
+

Synset

+

A synset is a set of synonyms that share a common meaning. The WordNet module provides functionality to work with these synsets.

+

This documentation is designed to help you navigate and use the various resources and modules available in the pythainlp.corpus package effectively. If you have any questions or need further assistance, please refer to the PyThaiNLP documentation or reach out to the PyThaiNLP community for support.

+

We hope you find this documentation helpful for your natural language processing tasks in the Thai language.

+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/el.html b/5.1/api/el.html new file mode 100644 index 0000000..1bda64c --- /dev/null +++ b/5.1/api/el.html @@ -0,0 +1,225 @@ + + + + + + + + + pythainlp.el — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.el

+

The pythainlp.el module is an essential component of Thai Entity Linking within the PyThaiNLP library. Entity Linking is a key natural language processing task that associates mentions in text with corresponding entities in a knowledge base.

+
+
+class pythainlp.el.EntityLinker(model_name: str = 'bela', device: str = 'cuda', tag: str = 'wikidata')[source]
+
+
+__init__(model_name: str = 'bela', device: str = 'cuda', tag: str = 'wikidata')[source]
+

EntityLinker

+
+
Parameters:
+
    +
  • model_name (str) – model name (bela)

  • +
  • device (str) – device for running model on

  • +
  • tag (str) – Entity linking tag (wikidata)

  • +
+
+
+

You can read about bela model at https://github.com/PyThaiNLP/MultiEL.

+
+ +
+
+get_el(list_text: List[str] | str) List[dict] | str[source]
+

Get Entity Linking from Thai Text

+
+
Parameters:
+

list_text (Union[List[str], str]) – list of Thai texts or a single text

+
+
Returns:
+

list of entity linking

+
+
Return type:
+

Union[List[dict], str]

+
+
Example:
+

+
+
from pythainlp.el import EntityLinker
+
+el = EntityLinker(device="cuda")
+print(el.get_el("จ๊อบเคยเป็นซีอีโอบริษัทแอปเปิล"))
+# output: [{'offsets': [11, 23],
+# 'lengths': [6, 7],
+# 'entities': ['Q484876', 'Q312'],
+# 'md_scores': [0.30301809310913086, 0.6399497389793396],
+# 'el_scores': [0.7142490744590759, 0.8657019734382629]}]
+
+
+
+ +
+ +
+

EntityLinker

+

The EntityLinker class is the core component of the pythainlp.el module, responsible for Thai Entity Linking. Entity Linking, also known as Named Entity Linking (NEL), plays a critical role in various applications, including question answering, information retrieval, and knowledge graph construction.

+
+

Example

+

Here’s a simple example of how to use the EntityLinker class:

+
+
::

from pythainlp.el import EntityLinker

+

text = "กรุงเทพเป็นเมืองหลวงของประเทศไทย"
el = EntityLinker()
linked_entities = el.get_el(text)
print(linked_entities)

+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/generate.html b/5.1/api/generate.html new file mode 100644 index 0000000..aec73d7 --- /dev/null +++ b/5.1/api/generate.html @@ -0,0 +1,512 @@ + + + + + + + + + pythainlp.generate — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.generate

+

The pythainlp.generate module is a powerful tool for generating Thai text using PyThaiNLP. It includes several classes and functions that enable users to create text based on various language models and n-gram models.

+
+

Modules

+
+

Unigram

+
+
+class pythainlp.generate.Unigram(name: str = 'tnc')[source]
+

Text generator using Unigram

+
+
Parameters:
+

name (str) – corpus name
  • tnc - Thai National Corpus (default)
  • ttc - Thai Textbook Corpus (TTC)
  • oscar - OSCAR Corpus

+
+
+
+
+__init__(name: str = 'tnc')[source]
+
+ +
+
+gen_sentence(start_seq: str = '', N: int = 3, prob: float = 0.001, output_str: bool = True, duplicate: bool = False) List[str] | str[source]
+
+
Parameters:
+
    +
  • start_seq (str) – word to begin sentence with

  • +
  • N (int) – number of words

  • +
  • output_str (bool) – output as string

  • +
  • duplicate (bool) – allow duplicate words in sentence

  • +
+
+
Returns:
+

list of words or a word string

+
+
Return type:
+

List[str], str

+
+
Example:
+

+
+
from pythainlp.generate import Unigram
+
+gen = Unigram()
+
+gen.gen_sentence("แมว")
+# output: 'แมวเวลานะนั้น'
+
+
+
+ +
+ +

The Unigram class provides functionality for generating text based on unigram language models. Unigrams are single words or tokens, and this class allows you to create text by selecting words probabilistically based on their frequencies in the training data.

+
+
+

Bigram

+
+
+class pythainlp.generate.Bigram(name: str = 'tnc')[source]
+

Text generator using Bigram

+
+
Parameters:
+

name (str) – corpus name
  • tnc - Thai National Corpus (default)

+
+
+
+
+__init__(name: str = 'tnc')[source]
+
+ +
+
+prob(t1: str, t2: str) float[source]
+

probability of word

+
+
Parameters:
+
    +
  • t1 (str) – first word

  • +
  • t2 (str) – second word

  • +
+
+
Returns:
+

probability value

+
+
Return type:
+

float

+
+
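A minimal sketch of querying a bigram probability (the exact value depends on the bundled TNC counts):

from pythainlp.generate import Bigram

gen = Bigram()
print(gen.prob("สวัสดี", "ครับ"))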
+
+ +
+
+gen_sentence(start_seq: str = '', N: int = 4, prob: float = 0.001, output_str: bool = True, duplicate: bool = False) List[str] | str[source]
+
+
Parameters:
+
    +
  • start_seq (str) – word to begin sentence with

  • +
  • N (int) – number of words

  • +
  • output_str (bool) – output as string

  • +
  • duplicate (bool) – allow duplicate words in sentence

  • +
+
+
Returns:
+

list of words or a word string

+
+
Return type:
+

List[str], str

+
+
Example:
+

+
+
from pythainlp.generate import Bigram
+
+gen = Bigram()
+
+gen.gen_sentence("แมว")
+# output: 'แมวไม่ได้รับเชื้อมัน'
+
+
+
+ +
+ +

The Bigram class is designed for generating text using bigram language models. Bigrams are sequences of two words, and this class enables you to generate text by predicting the next word based on the previous word’s probability.

+
+
+

Trigram

+
+
+class pythainlp.generate.Trigram(name: str = 'tnc')[source]
+

Text generator using Trigram

+
+
Parameters:
+

name (str) – corpus name
  • tnc - Thai National Corpus (default)

+
+
+
+
+__init__(name: str = 'tnc')[source]
+
+ +
+
+prob(t1: str, t2: str, t3: str) float[source]
+

probability of word

+
+
Parameters:
+
    +
  • t1 (str) – first word

  • +
  • t2 (str) – second word

  • +
  • t3 (str) – third word

  • +
+
+
Returns:
+

probability value

+
+
Return type:
+

float

+
+
+
+ +
+
+gen_sentence(start_seq: str = '', N: int = 4, prob: float = 0.001, output_str: bool = True, duplicate: bool = False) List[str] | str[source]
+
+
Parameters:
+
    +
  • start_seq (str) – word to begin sentence with

  • +
  • N (int) – number of words

  • +
  • output_str (bool) – output as string

  • +
  • duplicate (bool) – allow duplicate words in sentence

  • +
+
+
Returns:
+

list of words or a word string

+
+
Return type:
+

List[str], str

+
+
Example:
+

+
+
from pythainlp.generate import Trigram
+
+gen = Trigram()
+
+gen.gen_sentence()
+# output: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ'
+
+
+
+ +
+ +

The Trigram class extends text generation to trigram language models. Trigrams consist of three consecutive words, and this class facilitates the creation of text by predicting the next word based on the two preceding words’ probabilities.

+
+
+

pythainlp.generate.thai2fit.gen_sentence

+

The function pythainlp.generate.thai2fit.gen_sentence() offers a convenient way to generate sentences using the Thai2Vec language model. It takes a seed text as input and generates a coherent sentence based on the provided context.

+
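A minimal sketch (generation is probabilistic, so the output varies between runs; the seed-text argument follows the description above):

from pythainlp.generate.thai2fit import gen_sentence

print(gen_sentence("ประเทศไทย"))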
+
+

pythainlp.generate.wangchanglm.WangChanGLM

+
+
+class pythainlp.generate.wangchanglm.WangChanGLM[source]
+
+
+__init__()[source]
+
+ +
+
+is_exclude(text: str) bool[source]
+
+ +
+
+load_model(model_path: str = 'pythainlp/wangchanglm-7.5B-sft-en-sharded', return_dict: bool = True, load_in_8bit: bool = False, device: str = 'cuda', torch_dtype=torch.float16, offload_folder: str = './', low_cpu_mem_usage: bool = True)[source]
+

Load model

+
+
Parameters:
+
    +
  • model_path (str) – model path

  • +
  • return_dict (bool) – return dict

  • +
  • load_in_8bit (bool) – load model in 8bit

  • +
  • device (str) – device (cpu, cuda or other)

  • +
  • torch_dtype (torch_dtype) – torch_dtype

  • +
  • offload_folder (str) – offload folder

  • +
  • low_cpu_mem_usage (bool) – low cpu mem usage

  • +
+
+
+
+ +
+
+gen_instruct(text: str, max_new_tokens: int = 512, top_p: float = 0.95, temperature: float = 0.9, top_k: int = 50, no_repeat_ngram_size: int = 2, typical_p: float = 1.0, thai_only: bool = True, skip_special_tokens: bool = True)[source]
+

Generate Instruct

+
+
Parameters:
+
    +
  • text (str) – text

  • +
  • max_new_tokens (int) – maximum number of new tokens

  • +
  • top_p (float) – top p

  • +
  • temperature (float) – temperature

  • +
  • top_k (int) – top k

  • +
  • no_repeat_ngram_size (int) – do not repeat ngram size

  • +
  • typical_p (float) – typical p

  • +
  • thai_only (bool) – Thai only

  • +
  • skip_special_tokens (bool) – skip special tokens

  • +
+
+
Returns:
+

the answer from Instruct

+
+
Return type:
+

str

+
+
+
+ +
+
+instruct_generate(instruct: str, context: str | None = None, max_new_tokens=512, temperature: float = 0.9, top_p: float = 0.95, top_k: int = 50, no_repeat_ngram_size: int = 2, typical_p: float = 1, thai_only: bool = True, skip_special_tokens: bool = True)[source]
+

Generate Instruct

+
+
Parameters:
+
    +
  • instruct (str) – Instruct

  • +
  • context (str) – context

  • +
  • max_new_tokens (int) – maximum number of new tokens

  • +
  • top_p (float) – top p

  • +
  • temperature (float) – temperature

  • +
  • top_k (int) – top k

  • +
  • no_repeat_ngram_size (int) – do not repeat ngram size

  • +
  • typical_p (float) – typical p

  • +
  • thai_only (bool) – Thai only

  • +
  • skip_special_tokens (bool) – skip special tokens

  • +
+
+
Returns:
+

the answer from Instruct

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.generate.wangchanglm import WangChanGLM
+import torch
+
+model = WangChanGLM()
+
+model.load_model(device="cpu",torch_dtype=torch.bfloat16)
+
+print(model.instruct_generate(instruct="ขอวิธีลดน้ำหนัก"))
+# output: ลดน้ําหนักให้ได้ผล ต้องทําอย่างค่อยเป็นค่อยไป
+# ปรับเปลี่ยนพฤติกรรมการกินอาหาร
+# ออกกําลังกายอย่างสม่ําเสมอ
+# และพักผ่อนให้เพียงพอ
+# ที่สําคัญควรหลีกเลี่ยงอาหารที่มีแคลอรี่สูง
+# เช่น อาหารทอด อาหารมัน อาหารที่มีน้ําตาลสูง
+# และเครื่องดื่มแอลกอฮอล์
+
+
+
+ +
+ +

The WangChanGLM class is a part of the pythainlp.generate.wangchanglm module, offering text generation capabilities. It includes methods for creating text using the WangChanGLM language model.

+
+
+

Usage

+

To use the text generation capabilities provided by the pythainlp.generate module, follow these steps:

+
    +
  1. Select the appropriate class or function based on the type of language model you want to use (Unigram, Bigram, Trigram, Thai2Vec, or WangChanGLM).

  2. +
  3. Initialize the selected class or use the function with the necessary parameters.

  4. +
  5. Call the appropriate methods to generate text based on the chosen model.

  6. +
  7. Utilize the generated text for various applications, such as chatbots, content generation, and more.

  8. +
+
+
+

Example

+

Here’s a simple example of how to generate text using the Unigram class:

+
+
::

from pythainlp.generate import Unigram

+

# Initialize the Unigram model
unigram = Unigram()

+

# Generate a sentence
sentence = unigram.gen_sentence("สวัสดีครับ")

+

print(sentence)

+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/khavee.html b/5.1/api/khavee.html new file mode 100644 index 0000000..606d3ab --- /dev/null +++ b/5.1/api/khavee.html @@ -0,0 +1,392 @@ + + + + + + + + + pythainlp.khavee — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.khavee

+

The pythainlp.khavee module is a powerful toolkit designed for working with Thai poetry. The term “khavee” corresponds to “กวี” in the Thai language, which translates to “Poetry” in English. This toolkit equips users with the tools and utilities necessary for the creation, analysis, and verification of Thai poetry.

+
+

Modules

+
+

KhaveeVerifier

+
+
+class pythainlp.khavee.KhaveeVerifier[source]
+
+
+__init__()[source]
+

KhaveeVerifier: Thai Poetry verifier

+
+ +
+
+check_sara(word: str) str[source]
+

Check the vowels in the Thai word.

+
+
Parameters:
+

word (str) – Thai word

+
+
Returns:
+

vowel name of the word

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.khavee import KhaveeVerifier
+
+kv = KhaveeVerifier()
+
+print(kv.check_sara("เริง"))
+# output: 'เออ'
+
+
+
+ +
+
+check_marttra(word: str) str[source]
+

Check the Thai spelling Section in the Thai word.

+
+
Parameters:
+

word (str) – Thai word

+
+
Returns:
+

name of spelling Section of the word.

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.khavee import KhaveeVerifier
+
+kv = KhaveeVerifier()
+
+print(kv.check_marttra("สาว"))
+# output: 'เกอว'
+
+
+
+ +
+
+is_sumpus(word1: str, word2: str) bool[source]
+

Check the rhyme between two words.

+
+
Parameters:
+
    +
  • word1 (str) – Thai word

  • +
  • word2 (str) – Thai word

  • +
+
+
Returns:
+

boolean

+
+
Return type:
+

bool

+
+
Example:
+

+
+
from pythainlp.khavee import KhaveeVerifier
+
+kv = KhaveeVerifier()
+
+print(kv.is_sumpus("สรร", "อัน"))
+# output: True
+
+print(kv.is_sumpus("สรร", "แมว"))
+# output: False
+
+
+
+ +
+
+check_karu_lahu(text)[source]
+
+ +
+
+check_klon(text: str, k_type: int = 8) List[str] | str[source]
+

Check the suitability of the poem according to Thai principles.

+
+
Parameters:
+
    +
  • text (str) – Thai poem

  • +
  • k_type (int) – type of Thai poem

  • +
+
+
Returns:
+

the check results of the suitability of the poem according to Thai principles.

+
+
Return type:
+

Union[List[str], str]

+
+
Example:
+

+
+
from pythainlp.khavee import KhaveeVerifier
+
+kv = KhaveeVerifier()
+
+print(kv.check_klon(
+    'ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งคะนอง                 มีคนจับจอง เขาชื่อน้องเธียร',
+    k_type=4
+))
+# output: The poem is correct according to the principle.
+
+print(kv.check_klon(
+    'ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง                 เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร',
+    k_type=4
+))
+# output: [
+    "Can't find rhyme between paragraphs ('หมา', 'จอง') in paragraph 2",
+    "Can't find rhyme between paragraphs ('หมา', 'ทอง') in paragraph 2"
+]
+
+
+
+ +
+
+check_aek_too(text: List[str] | str, dead_syllable_as_aek: bool = False) List[bool] | List[str] | bool | str[source]
+

Checker of Thai tonal words

+
+
Parameters:
+
    +
  • text (Union[List[str], str]) – Thai word or list of Thai words

  • +
  • dead_syllable_as_aek (bool) – if True, dead syllable will be considered as aek

  • +
+
+
Returns:
+

the check result if the word is aek or too or False (not both) or list of check results if input is list

+
+
Return type:
+

Union[List[bool], List[str], bool, str]

+
+
Example:
+

+
+
from pythainlp.khavee import KhaveeVerifier
+
+kv = KhaveeVerifier()
+
+# การเช็คคำเอกโท
+print(
+    kv.check_aek_too("เอง"),
+    kv.check_aek_too("เอ่ง"),
+    kv.check_aek_too("เอ้ง"),
+)
+# -> False, aek, too
+print(kv.check_aek_too(["เอง", "เอ่ง", "เอ้ง"]))  # ใช้ List ได้เหมือนกัน
+# -> [False, 'aek', 'too']
+
+
+
+ +
+
+handle_karun_sound_silence(word: str) str[source]
+

Handle silent sounds in Thai words using ‘์’ character (Karun) +by stripping all characters before the ‘Karun’ character that should be silenced

+
+
Parameters:
+

text (str) – Thai word

+
+
Returns:
+

Thai word with silent words stripped

+
+
Return type:
+

str

+
+
+
+ +
+
+__dict__ = mappingproxy({'__module__': 'pythainlp.khavee.core', '__init__': <function KhaveeVerifier.__init__>, 'check_sara': <function KhaveeVerifier.check_sara>, 'check_marttra': <function KhaveeVerifier.check_marttra>, 'is_sumpus': <function KhaveeVerifier.is_sumpus>, 'check_karu_lahu': <function KhaveeVerifier.check_karu_lahu>, 'check_klon': <function KhaveeVerifier.check_klon>, 'check_aek_too': <function KhaveeVerifier.check_aek_too>, 'handle_karun_sound_silence': <function KhaveeVerifier.handle_karun_sound_silence>, '__dict__': <attribute '__dict__' of 'KhaveeVerifier' objects>, '__weakref__': <attribute '__weakref__' of 'KhaveeVerifier' objects>, '__doc__': None, '__annotations__': {}})
+
+ +
+
+__module__ = 'pythainlp.khavee.core'
+
+ +
+ +

The KhaveeVerifier class is the primary component of the pythainlp.khavee module, dedicated to the verification of Thai poetry. It offers a range of functions and methods for analyzing and validating Thai poetry, ensuring its adherence to the rules and structure of classical Thai poetic forms.

+
+
+

Example

+

Here’s a basic example of how to use the KhaveeVerifier class to verify Thai poetry:

+
from pythainlp.khavee import KhaveeVerifier
+
+# Initialize a KhaveeVerifier instance
+verifier = KhaveeVerifier()
+
+# Text to verify
+poem_text = "ดอกไม้สวยงาม แสนสดใส"
+
+# Check the poem against Thai poetic (klon) rules; KhaveeVerifier exposes
+# check_klon(), check_sara(), is_sumpus(), etc. (a real klon has several วรรค,
+# so this short text is only for demonstration)
+result = verifier.check_klon(poem_text, k_type=4)
+
+print(f"Verification result: {result}")
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/lm.html b/5.1/api/lm.html new file mode 100644 index 0000000..e0e0ab3 --- /dev/null +++ b/5.1/api/lm.html @@ -0,0 +1,205 @@ + + + + + + + + + pythainlp.lm — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.lm

+
+

Modules

+
+
+pythainlp.lm.calculate_ngram_counts(list_words: List[str], n_min: int = 2, n_max: int = 4) Dict[Tuple[str], int][source]
+

Calculates the counts of n-grams in the list of words for the specified size range.

+
+
Parameters:
+
    +
  • list_words (List[str]) – List of string

  • +
  • n_min (int) – The minimum n-gram size (default: 2).

  • +
  • n_max (int) – The maximum n-gram size (default: 4).

  • +
+
+
Returns:
+

A dictionary where keys are n-grams and values are their counts.

+
+
Return type:
+

Dict[Tuple[str], int]

+
+
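A minimal sketch (the counts shown assume a standard sliding-window n-gram count over the token list):

from pythainlp.lm import calculate_ngram_counts

tokens = ["ฉัน", "กิน", "ข้าว", "ฉัน", "กิน", "ข้าว"]
print(calculate_ngram_counts(tokens, n_min=2, n_max=3))
# expected (ordering may differ):
# {('ฉัน', 'กิน'): 2, ('กิน', 'ข้าว'): 2, ('ข้าว', 'ฉัน'): 1,
#  ('ฉัน', 'กิน', 'ข้าว'): 2, ('กิน', 'ข้าว', 'ฉัน'): 1, ('ข้าว', 'ฉัน', 'กิน'): 1}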
+
+ +
+
+pythainlp.lm.remove_repeated_ngrams(string_list: List[str], n: int = 2) List[str][source]
+

Remove repeated n-grams

+
+
Parameters:
+
    +
  • string_list (List[str]) – List of string

  • +
  • n (int) – n-gram size

  • +
+
+
Returns:
+

List of string

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.lm import remove_repeated_ngrams
+
+remove_repeated_ngrams(['เอา', 'เอา', 'แบบ', 'ไหน'], n=1)
+# output: ['เอา', 'แบบ', 'ไหน']
+
+
+
+ +
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/morpheme.html b/5.1/api/morpheme.html new file mode 100644 index 0000000..3bd6538 --- /dev/null +++ b/5.1/api/morpheme.html @@ -0,0 +1,227 @@ + + + + + + + + + pythainlp.morpheme — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.morpheme

+

The pythainlp.morpheme module collects functions for morpheme analysis, word formation, and related tasks for the Thai language.

+
+
+pythainlp.morpheme.nighit(w1: str, w2: str) str[source]
+

Nighit (นิคหิต, ํ ) is the niggahita mark used in Thai to form new words derived from Pali. This function uses a simple rule-based method to create a new Thai word from two words whose roots come from Pali.

+

Read more: https://www.trueplookpanya.com/learning/detail/1180

+
+
Parameters:
+
    +
  • w1 (str) – A Thai word that has a nighit.

  • +
  • w2 (str) – A Thai word.

  • +
+
+
Returns:
+

Thai word.

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.morpheme import nighit
+
+assert nighit("สํ","คีต")=="สังคีต"
+assert nighit("สํ","จร")=="สัญจร"
+assert nighit("สํ","ฐาน")=="สัณฐาน"
+assert nighit("สํ","นิษฐาน")=="สันนิษฐาน"
+assert nighit("สํ","ปทา")=="สัมปทา"
+assert nighit("สํ","โยค")=="สังโยค"
+
+
+
+ +
+
+pythainlp.morpheme.is_native_thai(word: str) bool[source]
+

Check if a word is a “native Thai word” (Thai: “คำไทยแท้”). This function is based on a simple heuristic algorithm and cannot be entirely reliable.

+
+
Parameters:
+

word (str) – word

+
+
Returns:
+

True or False

+
+
Return type:
+

bool

+
+
Example:
+

+
+

English word:

+
from pythainlp.morpheme import is_native_thai
+
+is_native_thai("Avocado")
+# output: False
+
+
+

Native Thai word:

+
is_native_thai("มะม่วง")
+# output: True
+is_native_thai("ตะวัน")
+# output: True
+
+
+

Non-native Thai word:

+
is_native_thai("สามารถ")
+# output: False
+is_native_thai("อิสริยาภรณ์")
+# output: False
+
+
+

The is_native_thai function is a language detection tool that identifies whether text is predominantly in the Thai language or not. It aids in language identification and text categorization tasks.

+
+ +
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/parse.html b/5.1/api/parse.html new file mode 100644 index 0000000..792b5c4 --- /dev/null +++ b/5.1/api/parse.html @@ -0,0 +1,260 @@ + + + + + + + + + pythainlp.parse — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.parse

+

The pythainlp.parse module provides dependency parsing for the Thai language. Dependency parsing is a fundamental task in natural language processing that involves identifying the grammatical relationships between words in a sentence, which helps to analyze sentence structure and meaning.

+
+

Modules

+
+

dependency_parsing

+
+
+pythainlp.parse.dependency_parsing(text: str, model: str | None = None, tag: str = 'str', engine: str = 'esupar') List[List[str]] | str[source]
+

Dependency Parsing

+
+
Parameters:
+
    +
  • text (str) – text to apply dependency parsing to

  • +
  • model (str) – model for using with engine (for esupar and transformers_ud)

  • +
  • tag (str) – output type (str or list)

  • +
  • engine (str) – the name of dependency parser

  • +
+
+
Returns:
+

str (conllu) or List

+
+
Return type:
+

Union[List[List[str]], str]

+
+
+
+
Options for engine
    +
  • esupar (default) - Tokenizer, POS tagger and Dependency parser using BERT/RoBERTa/DeBERTa models. GitHub

  • +
  • spacy_thai - Tokenizer, POS tagger, and dependency parser for the Thai language, using Universal Dependencies. GitHub

  • +
  • transformers_ud - TransformersUD GitHub

  • +
  • ud_goeswith - POS tagging and dependency parsing using goeswith for subwords

  • +
+
+
Options for model (esupar engine)
    +
  • th (default) - KoichiYasuoka/roberta-base-thai-spm-upos model Huggingface

  • +
  • KoichiYasuoka/deberta-base-thai-upos - DeBERTa(V2) model pre-trained on Thai Wikipedia texts for POS tagging and dependency parsing Huggingface

  • +
  • KoichiYasuoka/roberta-base-thai-syllable-upos - RoBERTa model pre-trained on Thai Wikipedia texts for POS tagging and dependency parsing. (syllable level) Huggingface

  • +
  • KoichiYasuoka/roberta-base-thai-char-upos - RoBERTa model pre-trained on Thai Wikipedia texts for POS tagging and dependency parsing. (char level) Huggingface

  • +
+
+
+

If you want to train models for esupar, you can read Huggingface

+
+
Options for model (transformers_ud engine)
    +
  • KoichiYasuoka/deberta-base-thai-ud-head (default) - DeBERTa(V2) model pretrained on Thai Wikipedia texts for dependency parsing (head-detection using Universal Dependencies) and question-answering, derived from deberta-base-thai. trained by th_blackboard.conll. Huggingface

  • +
  • KoichiYasuoka/roberta-base-thai-spm-ud-head - roberta model pretrained on Thai Wikipedia texts for dependency parsing. Huggingface

  • +
+
+
Options for model (ud_goeswith engine)
    +
  • KoichiYasuoka/deberta-base-thai-ud-goeswith (default) - This is a DeBERTa(V2) model pre-trained on Thai Wikipedia texts for POS tagging and dependency parsing (using goeswith for subwords) Huggingface

  • +
+
+
+
+
Example:
+

+
+
from pythainlp.parse import dependency_parsing
+
+print(dependency_parsing("ผมเป็นคนดี", engine="esupar"))
+# output:
+# 1       ผม      _       PRON    _       _       3       nsubj   _       SpaceAfter=No
+# 2       เป็น     _       VERB    _       _       3       cop     _       SpaceAfter=No
+# 3       คน      _       NOUN    _       _       0       root    _       SpaceAfter=No
+# 4       ดี       _       VERB    _       _       3       acl     _       SpaceAfter=No
+
+print(dependency_parsing("ผมเป็นคนดี", engine="spacy_thai"))
+# output:
+# 1       ผม              PRON    PPRS    _       2       nsubj   _       SpaceAfter=No
+# 2       เป็น             VERB    VSTA    _       0       ROOT    _       SpaceAfter=No
+# 3       คนดี             NOUN    NCMN    _       2       obj     _       SpaceAfter=No
+
+
+
+ +

The dependency_parsing function is the core component of the pythainlp.parse module. It offers dependency parsing capabilities for the Thai language. Given a Thai sentence as input, this function parses the sentence to identify the grammatical relationships between words, creating a dependency tree that represents the sentence’s structure.

+
+
+

Usage

+

To use the dependency_parsing function for Thai dependency parsing, follow these steps:

+
    +
  1. Import the pythainlp.parse module.

  2. +
  3. Use the dependency_parsing function with a Thai sentence as input.

  4. +
  5. The function will return the dependency parsing results, which include information about the grammatical relationships between words.

  6. +
+
+
+

Example

+

Here’s a basic example of how to use the dependency_parsing function:

+
from pythainlp.parse import dependency_parsing
+
+# Input Thai sentence
+sentence = "พี่น้องชาวบ้านกำลังเลี้ยงสตางค์ในสวน"
+
+# Perform dependency parsing
+parsing_result = dependency_parsing(sentence)
+
+# Print the parsing result
+print(parsing_result)
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/phayathaibert.html b/5.1/api/phayathaibert.html new file mode 100644 index 0000000..8e342bf --- /dev/null +++ b/5.1/api/phayathaibert.html @@ -0,0 +1,474 @@ + + + + + + + + + pythainlp.phayathaibert — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.phayathaibert

+

The pythainlp.phayathaibert module is built upon the phayathaibert base model.

+
+

Modules

+
+
+class pythainlp.phayathaibert.ThaiTextProcessor[source]
+
+
+__init__()[source]
+
+ +
+
+replace_url(text: str) str[source]
+

Replace url in text with TK_URL (https://stackoverflow.com/a/6041965) +:param str text: text to replace url +:return: text where urls are replaced +:rtype: str +:Example:

+
>>> replace_url("go to https://github.com")
+go to <url>
+
+
+
+ +
+
+rm_brackets(text: str) str[source]
+

Remove all empty brackets and artifacts within brackets from text. +:param str text: text to remove useless brackets +:return: text where all useless brackets are removed +:rtype: str +:Example:

+
>>> rm_brackets("hey() whats[;] up{*&} man(hey)")
+hey whats up man(hey)
+
+
+
+ +
+
+replace_newlines(text: str) str[source]
+
+

Replace newlines in text with spaces. +:param str text: text to replace all newlines with spaces +:return: text where all newlines are replaced with spaces +:rtype: str +:Example:

+
>>> replace_newlines("hey whats\nup")
hey whats up
hey whats up

+
+
+
+ +
+
+rm_useless_spaces(text: str) str[source]
+

Remove multiple spaces in text. (code from fastai) +:param str text: text to replace useless spaces +:return: text where all spaces are reduced to one +:rtype: str +:Example:

+
>>> rm_useless_spaces("oh         no")
+oh no
+
+
+
+ +
+
+replace_spaces(text: str, space_token: str = '<_>') str[source]
+

Replace spaces with _ +:param str text: text to replace spaces +:return: text where all spaces replaced with _ +:rtype: str +:Example:

+
>>> replace_spaces("oh no")
+oh_no
+
+
+
+ +
+
+replace_rep_after(text: str) str[source]
+

Replace repetitions at the character level in text +:param str text: input text to replace character repetition +:return: text with repetitive tokens removed. +:rtype: str +:Example:

+
>>> text = "กาาาาาาา"
+>>> replace_rep_after(text)
+'กา'
+
+
+
+ +
+
+replace_wrep_post(toks: List[str]) List[str][source]
+

Replace repetitive words post tokenization; +fastai replace_wrep does not work well with Thai. +:param List[str] toks: list of tokens +:return: list of tokens where repetitive words are removed. +:rtype: List[str] +:Example:

+
>>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"]
+>>> replace_wrep_post(toks)
+['กา', 'น้ำ']
+
+
+
+ +
+
+remove_space(toks: List[str]) List[str][source]
+

Do not include space for bag-of-word models. +:param List[str] toks: list of tokens +:return: List of tokens where space tokens (” “) are filtered out +:rtype: List[str] +:Example:

+
>>> toks = ["ฉัน", "เดิน", " ", "กลับ", "บ้าน"]
+>>> remove_space(toks)
+['ฉัน', 'เดิน', 'กลับ', 'บ้าน']
+
+
+
+ +
+
+preprocess(text: str, pre_rules: ~typing.List[~typing.Callable] = [<function ThaiTextProcessor.rm_brackets>, <function ThaiTextProcessor.replace_newlines>, <function ThaiTextProcessor.rm_useless_spaces>, <function ThaiTextProcessor.replace_spaces>, <function ThaiTextProcessor.replace_rep_after>], tok_func: ~typing.Callable = <function word_tokenize>) str[source]
+
+ +
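The preprocess method has no example in this documentation. The sketch below assumes it is called on a ThaiTextProcessor instance and applies the default pre_rules followed by word tokenization; the input string is made up for illustration and the exact output is not shown here.

from pythainlp.phayathaibert import ThaiTextProcessor

proc = ThaiTextProcessor()

# Default pre_rules: rm_brackets, replace_newlines, rm_useless_spaces,
# replace_spaces, replace_rep_after; tok_func defaults to word_tokenize.
cleaned = proc.preprocess("ไปเที่ยวกัน() พรุ่งนี้\nเจอกันน้าาาา")
print(cleaned)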
+ +
+
+class pythainlp.phayathaibert.ThaiTextAugmenter[source]
+
+
+__init__() None[source]
+
+ +
+
+generate(sample_text: str, word_rank: int, max_length: int = 3, sample: bool = False) str[source]
+
+ +
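No example is given for generate. The following sketch shows one plausible call, assuming the method continues sample_text with up to max_length generated words; the argument values and behaviour described in the comment are assumptions, not verified output.

from pythainlp.phayathaibert import ThaiTextAugmenter

aug = ThaiTextAugmenter()

# Hypothetical call: continue the prompt with up to three generated words.
new_text = aug.generate("ช้างมีทั้งหมด 50 ตัว บน", word_rank=0, max_length=3)
print(new_text)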
+
+augment(text: str, num_augs: int = 3, sample: bool = False) List[str][source]
+

Text augmentation from PhayaThaiBERT

+
+
Parameters:
+
    +
  • text (str) – Thai text

  • +
  • num_augs (int) – an amount of augmentation text needed as an output

  • +
  • sample (bool) – whether to sample the text as an output or not, true if more word diversity is needed

  • +
+
+
Returns:
+

list of augmented texts

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.augment.lm import ThaiTextAugmenter
+
+aug = ThaiTextAugmenter()
+aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_augs=5)
+
+# output = ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้ครับ.',
+    'ช้างมีทั้งหมด 50 ตัว บนพื้นดินครับ...',
+    'ช้างมีทั้งหมด 50 ตัว บนท้องฟ้าครับ...',
+    'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼',
+    'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
+
+
+
+ +
+ +
+
+class pythainlp.phayathaibert.PartOfSpeechTagger(model: str = 'lunarlist/pos_thai_phayathai')[source]
+
+
+__init__(model: str = 'lunarlist/pos_thai_phayathai') None[source]
+
+ +
+
+get_tag(sentence: str, strategy: str = 'simple') List[List[Tuple[str, str]]][source]
+

Marks sentences with part-of-speech (POS) tags.

+
+
Parameters:
+

sentence (str) – a sentence in Thai to be tagged

+
+
Returns:
+

a list of lists of tuples (word, POS tag)

+
+
Return type:
+

list[list[tuple[str, str]]]

+
+
Example:
+

+
+

Labels POS for given sentence:

+
from pythainlp.phayathaibert.core import PartOfSpeechTagger
+
+tagger = PartOfSpeechTagger()
+tagger.get_tag("แมวทำอะไรตอนห้าโมงเช้า")
+# output:
+# [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]]
+
+
+
+ +
+ +
+
+class pythainlp.phayathaibert.NamedEntityTagger(model: str = 'Pavarissy/phayathaibert-thainer')[source]
+
+
+__init__(model: str = 'Pavarissy/phayathaibert-thainer') None[source]
+
+ +
+
+get_ner(text: str, tag: bool = False, pos: bool = False, strategy: str = 'simple') List[Tuple[str, str]] | List[Tuple[str, str, str]] | str[source]
+

This function tags named entities in text in IOB format.

+
+
Parameters:
+
    +
  • text (str) – text in Thai to be tagged

  • +
  • pos (bool) – output with part-of-speech tags. (PhayaThaiBERT is supported in PartOfSpeechTagger)

  • +
+
+
Returns:
+

a list of tuples associated with tokenized words, NER tags, +POS tags (if the parameter pos is specified as True), +and output HTML-like tags (if the parameter tag is +specified as True). +Otherwise, return a list of tuples associated with tokenized +words and NER tags

+
+
Return type:
+

Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]

+
+
Example:
+
>>> from pythainlp.phayathaibert.core import NamedEntityTagger
+>>>
+>>> tagger = NamedEntityTagger()
+>>> tagger.get_ner("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย")
+[('นายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย', 'PERSON'),
+('จาก', 'LOCATION'),
+('ประเทศไทย', 'LOCATION')]
+>>> tagger.get_ner("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย", tag=True)
+'ทดสอบ<PERSON>นายปวริศ เรืองจุติโพธิ์พาน</PERSON>                <LOCATION>จาก</LOCATION><LOCATION>ประเทศไทย</LOCATION>'
+
+
+
+
+
+ +
+ +
+
+pythainlp.phayathaibert.segment(sentence: str) List[str][source]
+

Subword tokenization using PhayaThaiBERT's tokenizer: the SentencePiece tokenizer from the WangchanBERTa model with vocabulary expansion.

+
+
Parameters:
+

sentence (str) – text to be tokenized

+
+
Returns:
+

list of subwords

+
+
Return type:
+

list[str]

+
+
+
+ +
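A minimal usage sketch for segment; the subword split shown in the comment is illustrative, not a verified output.

from pythainlp.phayathaibert import segment

subwords = segment("ผมเป็นคนดี")
print(subwords)
# e.g. a list of SentencePiece subword strings such as ['▁ผม', 'เป็น', 'คน', 'ดี']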
+
+ + +
+
+ +
+
+
+
\ No newline at end of file
diff --git a/5.1/api/soundex.html b/5.1/api/soundex.html
new file mode 100644
index 0000000..cec4a08
--- /dev/null
+++ b/5.1/api/soundex.html
@@ -0,0 +1,540 @@
pythainlp.soundex — PyThaiNLP v5.1.0 documentation
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.soundex

+

The pythainlp.soundex module provides soundex algorithms for the Thai language. Soundex is a phonetic algorithm used to encode words or names into a standardized representation based on their pronunciation, making it useful for tasks like name matching and search.

+
+

Modules

+
+

soundex

+
+
+pythainlp.soundex.soundex(text: str, engine: str = 'udom83', length: int = 4) str[source]
+

This function converts Thai text into phonetic code.

+
+
Parameters:
+
    +
  • text (str) – word

  • +
  • engine (str) – soundex engine

  • +
  • length (int) – preferred length of the Soundex code (default is 4) for metasound and prayut_and_somchaip only

  • +
+
+
Returns:
+

Soundex code

+
+
Return type:
+

str

+
+
Options for engine:
+
    +
  • udom83 (default) - Thai soundex algorithm proposed by Wannee Udompanich [2]

  • +
  • lk82 - Thai soundex algorithm proposed by Vichit Lorchirachoonkul [3]

  • +
  • metasound - Thai soundex algorithm based on a combination +of Metaphone and Soundex proposed by Snae & Brückner [1]

  • +
  • prayut_and_somchaip - Thai-English Cross-Language Transliterated +Word Retrieval using Soundex Technique [4]

  • +
+
+
Example:
+

+
+
from pythainlp.soundex import soundex
+
+soundex("ลัก"), soundex("ลัก", engine='lk82'), \
+    soundex("ลัก", engine='metasound')
+# output: ('ร100000', 'ร1000', 'ล100')
+
+soundex("รัก"), soundex("รัก", engine='lk82'), \
+    soundex("รัก", engine='metasound')
+# output: ('ร100000', 'ร1000', 'ร100')
+
+soundex("รักษ์"), soundex("รักษ์", engine='lk82'), \
+    soundex("รักษ์", engine='metasound')
+# output: ('ร100000', 'ร1000', 'ร100')
+
+soundex("บูรณการ"), soundex("บูรณการ", engine='lk82'), \
+    soundex("บูรณการ", engine='metasound')
+# output: ('บ931900', 'บE419', 'บ551')
+
+soundex("ปัจจุบัน"), soundex("ปัจจุบัน", engine='lk82'), \
+    soundex("ปัจจุบัน", engine='metasound')
+# output: ('ป775300', 'ป3E54', 'ป223')
+
+soundex("vp", engine="prayut_and_somchaip")
+# output: '11'
+soundex("วีพี", engine="prayut_and_somchaip")
+# output: '11'
+
+
+
+ +

The soundex function is a basic Soundex algorithm for the Thai language. It encodes a Thai word into a Soundex code, allowing for approximate matching of words with similar pronunciation.
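Because words that sound alike map to the same code, a simple way to use soundex for approximate matching is to compare the codes of two words. The sketch below is based on the outputs shown in the example above.

from pythainlp.soundex import soundex

def sound_alike(a: str, b: str, engine: str = "udom83") -> bool:
    # Two words are treated as sound-alike if their Soundex codes are equal.
    return soundex(a, engine=engine) == soundex(b, engine=engine)

print(sound_alike("รัก", "รักษ์"))  # True: both encode to 'ร100000' with udom83
print(sound_alike("รัก", "ลัก"))   # also True, per the outputs shown above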

+
+
+

lk82

+
+
+pythainlp.soundex.lk82(text: str) str[source]
+

This function converts Thai text into phonetic code with the +Thai soundex algorithm named LK82 [3].

+
+
Parameters:
+

text (str) – Thai word

+
+
Returns:
+

LK82 soundex of the given Thai word

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.soundex import lk82
+
+lk82("ลัก")
+# output: 'ร1000'
+
+lk82("รัก")
+# output: 'ร1000'
+
+lk82("รักษ์")
+# output: 'ร1000'
+
+lk82("บูรณการ")
+# output: 'บE419'
+
+lk82("ปัจจุบัน")
+# output: 'ป3E54'
+
+
+
+ +

The lk82 module implements the Thai Soundex algorithm proposed by Vichit Lorchirachoonkul in 1982. This module is suitable for encoding Thai words into Soundex codes for phonetic comparisons.

+
+
+

udom83

+
+
+pythainlp.soundex.udom83(text: str) str[source]
+

This function converts Thai text into phonetic code with the +Thai soundex algorithm named Udom83 [2].

+
+
Parameters:
+

text (str) – Thai word

+
+
Returns:
+

Udom83 soundex

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.soundex import udom83
+
+udom83("ลัก")
+# output : 'ล100'
+
+udom83("รัก")
+# output: 'ร100'
+
+udom83("รักษ์")
+# output: 'ร100'
+
+udom83("บูรณการ")
+# output: 'บ5515'
+
+udom83("ปัจจุบัน")
+# output: 'ป775300'
+
+
+
+ +

The udom83 module is based on a homonymic approach for sound-alike string search. It encodes Thai words using the Wannee Udompanich Soundex algorithm developed in 1983.

+
+
+

metasound

+
+
+pythainlp.soundex.metasound(text: str, length: int = 4) str[source]
+

This function converts Thai text into phonetic code with the +matching technique called MetaSound +[1] (combination between Soundex and Metaphone algorithms). +MetaSound algorithm was developed specifically for the Thai language.

+
+
Parameters:
+
    +
  • text (str) – Thai text

  • +
  • length (int) – preferred length of the MetaSound code (default is 4)

  • +
+
+
Returns:
+

MetaSound for the given text

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.soundex.metasound import metasound
+
+metasound("ลัก")
+# output: 'ล100'
+
+metasound("รัก")
+# output: 'ร100'
+
+metasound("รักษ์")
+# output: 'ร100'
+
+metasound("บูรณการ", 5)
+# output: 'บ5515'
+
+metasound("บูรณการ", 6)
+# output: 'บ55150'
+
+metasound("บูรณการ", 4)
+# output: 'บ551'
+
+
+
+ +

The metasound module implements MetaSound, a phonetic matching algorithm developed specifically for Thai that combines ideas from Soundex and Metaphone, as proposed by Snae & Brückner [1]. It offers advanced phonetic matching capabilities for Thai words and names.

+
+
+

prayut_and_somchaip

+
+
+pythainlp.soundex.prayut_and_somchaip(text: str, length: int = 4) str[source]
+

This function converts English-Thai Cross-Language Transliterated Word into +phonetic code with the matching technique called Soundex [4].

+
+
Parameters:
+
    +
  • text (str) – English-Thai Cross-Language Transliterated Word

  • +
  • length (int) – preferred length of the Soundex code (default is 4)

  • +
+
+
Returns:
+

Soundex for the given text

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.soundex.prayut_and_somchaip import prayut_and_somchaip
+
+prayut_and_somchaip("king", 2)
+# output: '52'
+
+prayut_and_somchaip("คิง", 2)
+# output: '52'
+
+
+
+ +

The prayut_and_somchaip module is designed for Thai-English cross-language transliterated word retrieval using the Soundex technique. It is particularly useful for matching transliterated words in both languages.

+
+
+

pythainlp.soundex.sound.word_approximation

+
+
+pythainlp.soundex.sound.word_approximation(word: str, list_word: List[str]) List[float][source]
+

Thai Word Approximation

+
+
Parameters:
+
    +
  • word (str) – Thai word

  • +
  • list_word (str) – Thai word

  • +
+
+
Returns:
+

List of approximation of words (The smaller the value, the closer)

+
+
Return type:
+

List[float]

+
+
Example:
+

+
+
from pythainlp.soundex.sound import word_approximation
+
+word_approximation("รถ", ["รด", "รส", "รม", "น้ำ"])
+# output : [0.0, 0.0, 3.875, 8.375]
+
+
+
+ +

The pythainlp.soundex.sound.word_approximation module offers word approximation functionality. It allows users to find Thai words that are phonetically similar to a given word.
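Since smaller scores mean closer pronunciation, the returned values can be paired with the candidate list to pick the closest match. This sketch reuses the output shown in the example above.

from pythainlp.soundex.sound import word_approximation

candidates = ["รด", "รส", "รม", "น้ำ"]
scores = word_approximation("รถ", candidates)

# Pair each candidate with its score and take the one with the smallest value.
closest = min(zip(candidates, scores), key=lambda pair: pair[1])
print(closest)  # e.g. ('รด', 0.0), based on the scores shown above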

+
+
+

pythainlp.soundex.sound.audio_vector

+
+
+pythainlp.soundex.sound.audio_vector(word: str) List[List[int]][source]
+

Convert audio to vector list

+
+
Parameters:
+

word (str) – Thai word

+
+
Returns:
+

List of features from panphon

+
+
Return type:
+

List[List[int]]

+
+
Example:
+

+
+
from pythainlp.soundex.sound import audio_vector
+
+audio_vector("น้ำ")
+# output : [[-1, 1, 1, -1, -1, -1, ...]]
+
+
+
+ +

The pythainlp.soundex.sound.audio_vector module provides audio vector functionality for Thai words. It allows users to work with audio vectors based on phonetic properties.

+
+
+

pythainlp.soundex.sound.word2audio

+
+
+pythainlp.soundex.sound.word2audio(word: str) str[source]
+

Convert word to IPA

+
+
Parameters:
+

word (str) – Thai word

+
+
Returns:
+

IPA with tones removed from the text

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.soundex.sound import word2audio
+
+word2audio("น้ำ")
+# output : 'n aː m .'
+
+
+
+ +

The pythainlp.soundex.sound.word2audio module converts a Thai word into its phonetic (IPA) representation with tones removed, which can serve as a basis for sound-based comparison of words.

+
+
+
+

References

+ +
+
+ + +
+
+ +
+
+
+
\ No newline at end of file
diff --git a/5.1/api/spell.html b/5.1/api/spell.html
new file mode 100644
index 0000000..1d05a28
--- /dev/null
+++ b/5.1/api/spell.html
@@ -0,0 +1,655 @@
pythainlp.spell — PyThaiNLP v5.1.0 documentation
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.spell

+

The pythainlp.spell module is a powerful tool for finding the closest correctly spelled word to a given text in the Thai language. It provides functionalities to correct spelling errors and enhance the accuracy of text processing.

+
+

Modules

+
+

correct

+
+
+pythainlp.spell.correct(word: str, engine: str = 'pn') str[source]
+

Corrects the spelling of the given word by returning +the correctly spelled word.

+
+
Parameters:
+
    +
  • word (str) – word to correct spelling of

  • +
  • engine (str) –

      +
    • pn - Peter Norvig’s algorithm [1] (default)

    • +
    • phunspell - A spell checker utilizing spylls, a port of Hunspell.

    • +
    • symspellpy - symspellpy is a Python port of SymSpell v6.5.

    • +
    • wanchanberta_thai_grammarly - WanchanBERTa Thai Grammarly

    • +
    +

  • +
+
+
Returns:
+

the corrected word

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.spell import correct
+
+correct("เส้นตรบ")
+# output: 'เส้นตรง'
+
+correct("ครัช")
+# output: 'ครับ'
+
+correct("สังเกตุ")
+# output: 'สังเกต'
+
+correct("กระปิ")
+# output: 'กะปิ'
+
+correct("เหตการณ")
+# output: 'เหตุการณ์'
+
+
+
+ +

The correct function is designed to correct the spelling of a single Thai word. Given an input word, this function returns the closest correctly spelled word from the dictionary, making it valuable for spell-checking and text correction tasks.

+
+
+

correct_sent

+
+
+pythainlp.spell.correct_sent(list_words: List[str], engine: str = 'pn') List[str][source]
+

Corrects and returns the spelling of the given sentence

+
+
Parameters:
+
    +
  • list_words (List[str]) – list of words in sentence

  • +
  • engine (str) –

      +
    • pn - Peter Norvig’s algorithm [1] (default)

    • +
    • phunspell - A spell checker utilizing spylls, a port of Hunspell.

    • +
    • symspellpy - symspellpy is a Python port of SymSpell v6.5.

    • +
    • wanchanberta_thai_grammarly - WanchanBERTa Thai Grammarly

    • +
    +

  • +
+
+
Returns:
+

the corrected list of words in sentence

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.spell import correct_sent
+
+correct_sent(["เด็","อินอร์เน็ต","แรง"],engine='symspellpy')
+# output: ['เด็ก', 'อินเทอร์เน็ต', 'แรง']
+
+
+
+ +

The correct_sent function is an extension of the correct function and is used to correct an entire sentence. It takes a list of words (a tokenized sentence), corrects each word, and returns the corrected list of words. This is beneficial for proofreading and improving the readability of Thai text.

+
+
+

spell

+
+
+pythainlp.spell.spell(word: str, engine: str = 'pn') List[str][source]
+

Provides a list of possible correct spellings of the given word. +The list of words are from the words in the dictionary +that incurs an edit distance value of 1 or 2. +The result is a list of words sorted by their occurrences +in the spelling dictionary in descending order.

+
+
Parameters:
+
    +
  • word (str) – Word to check spell of

  • +
  • engine (str) –

      +
    • pn - Peter Norvig’s algorithm [1] (default)

    • +
    • phunspell - A spell checker utilizing spylls, a port of Hunspell.

    • +
    • symspellpy - symspellpy is a Python port of SymSpell v6.5.

    • +
    • tltk - wrapper for TLTK.

    • +
    +

  • +
+
+
Returns:
+

list of possible correct words within 1 or 2 edit distance and +sorted by frequency of word occurrences in the spelling dictionary +in descending order.

+
+
Return type:
+

list[str]

+
+
Example:
+

+
+
from pythainlp.spell import spell
+
+spell("เส้นตรบ",  engine="pn")
+# output: ['เส้นตรง']
+
+spell("เส้นตรบ")
+# output: ['เส้นตรง']
+
+spell("เส้นตรบ",  engine="tltk")
+# output: ['เส้นตรง']
+
+spell("ครัช")
+# output: ['ครับ', 'ครัว', 'รัช', 'ครัม', 'ครัน', 'วรัช', 'ครัส',
+# 'ปรัช', 'บรัช', 'ครัง', 'คัช', 'คลัช', 'ครัย', 'ครัด']
+
+spell("กระปิ")
+# output: ['กะปิ', 'กระบิ']
+
+spell("สังเกตุ")
+# output:  ['สังเกต']
+
+spell("เหตการณ")
+# output:  ['เหตุการณ์']
+
+
+
+ +

The spell function suggests corrections for a given Thai word. It returns a list of possible correctly spelled words within an edit distance of 1 or 2, sorted by their frequency in the spelling dictionary. This function is useful for generating spelling suggestions for Thai words.

+
+
+

spell_sent

+
+
+pythainlp.spell.spell_sent(list_words: List[str], engine: str = 'pn') List[List[str]][source]
+

Provides a list of possible correct spellings of sentence

+
+
Parameters:
+
    +
  • list_words (List[str]) – list of words in sentence

  • +
  • engine (str) –

      +
    • pn - Peter Norvig’s algorithm [1] (default)

    • +
    • phunspell - A spell checker utilizing spylls, a port of Hunspell.

    • +
    • symspellpy - symspellpy is a Python port of SymSpell v6.5.

    • +
    +

  • +
+
+
Returns:
+

list of possibly correct words

+
+
Return type:
+

List[List[str]]

+
+
Example:
+

+
+
from pythainlp.spell import spell_sent
+
+spell_sent(["เด็","อินอร์เน็ต","แรง"],engine='symspellpy')
+# output: [['เด็ก', 'อินเทอร์เน็ต', 'แรง']]
+
+
+
+ +

The spell_sent function extends the spelling-suggestion functionality to entire sentences. Given a list of words (a tokenized sentence), it returns lists of possible correct spellings for the sentence.

+
+
+

NorvigSpellChecker

+
+
+class pythainlp.spell.NorvigSpellChecker(custom_dict: ~typing.Dict[str, int] | ~typing.Iterable[str] | ~typing.Iterable[~typing.Tuple[str, int]] | None = None, min_freq: int = 2, min_len: int = 2, max_len: int = 40, dict_filter: ~typing.Callable[[str], bool] | None = <function _is_thai_and_not_num>)[source]
+
+
+__init__(custom_dict: ~typing.Dict[str, int] | ~typing.Iterable[str] | ~typing.Iterable[~typing.Tuple[str, int]] | None = None, min_freq: int = 2, min_len: int = 2, max_len: int = 40, dict_filter: ~typing.Callable[[str], bool] | None = <function _is_thai_and_not_num>)[source]
+

Initializes Peter Norvig’s spell checker object. +Spelling dictionary can be customized. +By default, spelling dictionary is from +Thai National Corpus

+

Basically, Norvig’s spell checker will choose the most likely +corrected spelling given a word by searching for candidates of +corrected words based on edit distance. +Then, it selects the candidate with +the highest word occurrence probability.

+
+
Parameters:
+
    +
  • custom_dict (str) –

A custom spelling dictionary. This can be:
(1) a dictionary (dict) with words (str) as keys and frequencies (int) as values;
(2) an iterable (list, tuple, or set) of word (str) and frequency (int) tuples: (str, int); or
(3) an iterable of just words (str), without frequencies; in this case a frequency of 1 will be assigned to every word.
Default is from the Thai National Corpus (around 40,000 words).

    +

  • +
  • min_freq (int) – Minimum frequency of a word to keep (default = 2)

  • +
  • min_len (int) – Minimum length (in characters) of a word to keep +(default = 2)

  • +
  • max_len (int) – Maximum length (in characters) of a word to keep +(default = 40)

  • +
  • dict_filter (func) – A function to filter the dictionary. +Default filter removes any word +with numbers or non-Thai characters. +If no filter is required, use None.

  • +
+
+
+
+ +
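A sketch of constructing the checker with a custom dictionary and relaxed filtering; the word list and frequencies below are made up for illustration.

from pythainlp.spell import NorvigSpellChecker

# Custom dictionary as (word, frequency) tuples; dict_filter=None keeps every entry.
custom_words = [("ปัญญา", 50), ("ปัญหา", 120), ("บัญชา", 10)]
checker = NorvigSpellChecker(custom_dict=custom_words, min_freq=1, dict_filter=None)

# Expected to pick the most probable candidate at edit distance 1, e.g. 'ปัญหา'.
print(checker.correct("ปัญชา"))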
+
+dictionary() ItemsView[str, int][source]
+

Returns the spelling dictionary currently used by this spell checker

+
+
Returns:
+

spelling dictionary of this instance

+
+
Return type:
+

list[tuple[str, int]]

+
+
Example:
+

+
+
from pythainlp.spell import NorvigSpellChecker
+
+dictionary= [("หวาน", 30), ("มะนาว", 2), ("แอบ", 3223)]
+
+checker = NorvigSpellChecker(custom_dict=dictionary)
+checker.dictionary()
+# output: dict_items([('หวาน', 30), ('มะนาว', 2), ('แอบ', 3223)])
+
+
+
+ +
+
+known(words: Iterable[str]) List[str][source]
+

Returns a list of given words found in the spelling dictionary

+
+
Parameters:
+

words (list[str]) – A list of words to check if they exist +in the spelling dictionary

+
+
Returns:
+

intersection of the given word list and words +in the spelling dictionary

+
+
Return type:
+

list[str]

+
+
Example:
+

+
+
from pythainlp.spell import NorvigSpellChecker
+
+checker = NorvigSpellChecker()
+
+checker.known(["เพยน", "เพล", "เพลง"])
+# output: ['เพล', 'เพลง']
+
+checker.known(['ยกไ', 'ไฟล์ม'])
+# output: []
+
+checker.known([])
+# output: []
+
+
+
+ +
+
+prob(word: str) float[source]
+

Returns the probability of an input word, +according to the spelling dictionary

+
+
Parameters:
+

word (str) – A word to check occurrence probability of

+
+
Returns:
+

word occurrence probability

+
+
Return type:
+

float

+
+
Example:
+

+
+
from pythainlp.spell import NorvigSpellChecker
+
+checker = NorvigSpellChecker()
+
+checker.prob("ครัช")
+# output: 0.0
+
+checker.prob("รัก")
+# output: 0.0006959172792052158
+
+checker.prob("น่ารัก")
+# output: 9.482306849763902e-05
+
+
+
+ +
+
+freq(word: str) int[source]
+

Returns the frequency of an input word, +according to the spelling dictionary

+
+
Parameters:
+

word (str) – A word to check frequency of

+
+
Returns:
+

frequency of the given word in the spelling dictionary

+
+
Return type:
+

int

+
+
Example:
+

+
+
from pythainlp.spell import NorvigSpellChecker
+
+checker = NorvigSpellChecker()
+
+checker.freq("ปัญญา")
+# output: 3639
+
+checker.freq("บิญชา")
+# output: 0
+
+
+
+ +
+
+spell(word: str) List[str][source]
+

Returns a list of all correctly-spelled words whose spelling +is similar to the given word by edit distance metrics. +The returned list of words will be sorted by decreasing +order of word frequencies in the word spelling dictionary.

+

First, if the input word is spelled correctly, +this method returns a list of exactly one word which is itself. +Next, this method looks for a list of all correctly spelled words +whose edit distance value is 1 from the input word. +If there is no such word, then the search expands to +a list of words whose edit distance value is 2. +And if that still fails, the list of input words is returned.

+
+
Parameters:
+

word (str) – A word to check spelling of

+
+
Returns:
+

list of possibly correct words within 1 or 2 edit distance +and sorted by frequency of word occurrence in the +spelling dictionary in descending order.

+
+
Return type:
+

list[str]

+
+
Example:
+

+
+
from pythainlp.spell import NorvigSpellChecker
+
+checker = NorvigSpellChecker()
+
+checker.spell("เส้นตรบ")
+# output: ['เส้นตรง']
+
+checker.spell("ครัช")
+# output: ['ครับ', 'ครัว', 'รัช', 'ครัม', 'ครัน',
+# 'วรัช', 'ครัส', 'ปรัช', 'บรัช', 'ครัง',
+#'คัช', 'คลัช', 'ครัย', 'ครัด']
+
+
+
+ +
+
+correct(word: str) str[source]
+

Returns the most possible word, using the probability from +the spelling dictionary

+
+
Parameters:
+

word (str) – A word to correct spelling of

+
+
Returns:
+

the correct spelling of the given word

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.spell import NorvigSpellChecker
+
+checker = NorvigSpellChecker()
+
+checker.correct("ปัญชา")
+# output: 'ปัญหา'
+
+checker.correct("บิญชา")
+# output: 'บัญชา'
+
+checker.correct("มิตรภาบ")
+# output: 'มิตรภาพ'
+
+
+
+ +
+
+__dict__ = mappingproxy({'__module__': 'pythainlp.spell.pn', '__init__': <function NorvigSpellChecker.__init__>, 'dictionary': <function NorvigSpellChecker.dictionary>, 'known': <function NorvigSpellChecker.known>, 'prob': <function NorvigSpellChecker.prob>, 'freq': <function NorvigSpellChecker.freq>, 'spell': <function NorvigSpellChecker.spell>, 'correct': <function NorvigSpellChecker.correct>, '__dict__': <attribute '__dict__' of 'NorvigSpellChecker' objects>, '__weakref__': <attribute '__weakref__' of 'NorvigSpellChecker' objects>, '__doc__': None, '__annotations__': {}})
+
+ +
+
+__module__ = 'pythainlp.spell.pn'
+
+ +
+ +

The NorvigSpellChecker class is a fundamental component of the pythainlp.spell module. It implements a spell-checking algorithm based on the work of Peter Norvig. This class is designed for more advanced spell-checking and provides customizable settings for spell correction.

+
+
+

DEFAULT_SPELL_CHECKER

+
+
+pythainlp.spell.DEFAULT_SPELL_CHECKER = Default instance of the standard NorvigSpellChecker, using word list data from the Thai National Corpus: http://www.arts.chula.ac.th/ling/tnc/
+
+ +

The DEFAULT_SPELL_CHECKER is an instance of the NorvigSpellChecker class with default settings. It is pre-configured to use word list data from the Thai National Corpus, making it a reliable choice for general spell-checking tasks.
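A brief usage sketch of the default checker; since it is a NorvigSpellChecker instance, it exposes the same methods, and the expected outputs in the comments come from the examples shown earlier in this page.

from pythainlp.spell import DEFAULT_SPELL_CHECKER

# DEFAULT_SPELL_CHECKER is a ready-made NorvigSpellChecker instance,
# so it can be used without constructing a new object.
print(DEFAULT_SPELL_CHECKER.correct("เส้นตรบ"))  # 'เส้นตรง', per the examples above
print(DEFAULT_SPELL_CHECKER.spell("กระปิ"))      # ['กะปิ', 'กระบิ'], per the examples above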

+
+
+
+

References

+ +
+
+ + +
+
+ +
+
+
+
\ No newline at end of file
diff --git a/5.1/api/summarize.html b/5.1/api/summarize.html
new file mode 100644
index 0000000..96f4a0a
--- /dev/null
+++ b/5.1/api/summarize.html
@@ -0,0 +1,504 @@
pythainlp.summarize — PyThaiNLP v5.1.0 documentation
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.summarize

+

The pythainlp.summarize module provides a Thai text summarizer.

+
+

Modules

+
+
+pythainlp.summarize.summarize(text: str, n: int = 1, engine: str = 'frequency', tokenizer: str = 'newmm') List[str][source]
+

This function summarizes text based on frequency of words.

+

Under the hood, this function first tokenizes sentences from the given +text with pythainlp.tokenize.sent_tokenize(). +Then, it computes frequencies of tokenized words +(with pythainlp.tokenize.word_tokenize()) in all sentences +and normalizes them with maximum word frequency. The words with normalized +frequencies that are less than 0.1 or greater than 0.9 will be +filtered out from frequency dictionary. Finally, it picks n sentences +with highest sum of normalized frequency from all words which are +in the sentence and also appear in the frequency dictionary.

+
+
Parameters:
+
    +
  • text (str) – text to be summarized

  • +
  • n (int) – number of sentences to be included in the summary +By default, n is 1 (effective for frequency engine only)

  • +
  • engine (str) – text summarization engine (By default: frequency).

  • +
  • tokenizer (str) – word tokenizer engine name (refer to +pythainlp.tokenize.word_tokenize()). +By default, tokenizer is newmm +(effective for frequency engine only)

  • +
+
+
Returns:
+

list of selected sentences

+
+
+
+
Options for engine
    +
  • frequency (default) - frequency of words

  • +
  • mt5 - mT5-small model

  • +
  • mt5-small - mT5-small model

  • +
  • mt5-base - mT5-base model

  • +
  • mt5-large - mT5-large model

  • +
  • mt5-xl - mT5-xl model

  • +
  • mt5-xxl - mT5-xxl model

  • +
  • mt5-cpe-kmutt-thai-sentence-sum - mT5 Thai sentence summarization by CPE KMUTT

  • +
+
+
+
+
Example:
+

+
+
from pythainlp.summarize import summarize
+
+text = '''
+        ทำเนียบท่าช้าง หรือ วังถนนพระอาทิตย์
+        ตั้งอยู่บนถนนพระอาทิตย์ เขตพระนคร กรุงเทพมหานคร
+        เดิมเป็นบ้านของเจ้าพระยามหาโยธา (ทอเรียะ คชเสนี)
+        บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์ (พญาเจ่ง)
+        ต้นสกุลคชเสนี เชื้อสายมอญ เจ้าพระยามหาโยธา (ทอเรีย)
+        เป็นปู่ของเจ้าจอมมารดากลิ่นในพระบาทสมเด็จพระจอมเกล้าเจ้าอยู่หัว
+        และเป็นมรดกตกทอดมาถึง พระเจ้าบรมวงศ์เธอ กรมพระนเรศรวรฤทธิ์
+        (พระองค์เจ้ากฤดาภินิหาร)
+        ต่อมาในรัชสมัยพระบาทสมเด็จพระจุลจอมเกล้าเจ้าอยู่หัวโปรดเกล้าฯ
+        ให้สร้างตำหนัก 2 ชั้น
+        เป็นที่ประทับของพระเจ้าบรมวงศ์เธอ
+        กรมพระนเรศวรฤทิธิ์และเจ้าจอมมารดา
+        ต่อมาเรียกอาคารหลักนี้ว่า ตำหนักเดิม
+    '''
+
+summarize(text, n=1)
+# output: ['บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์']
+
+summarize(text, n=3)
+# output: ['บุตรเจ้าพระยามหาโยธานราธิบดีศรีพิชัยณรงค์',
+# 'เดิมเป็นบ้านของเจ้าพระยามหาโยธา',
+# 'เจ้าพระยามหาโยธา']
+
+summarize(text, engine="mt5-small")
+# output: ['<extra_id_0> ท่าช้าง หรือ วังถนนพระอาทิตย์
+# เขตพระนคร กรุงเทพมหานคร ฯลฯ ดังนี้:
+# ที่อยู่ - ศิลปวัฒนธรรม']
+
+text = "ถ้าพูดถึงขนมหวานในตำนานที่ชื่นใจที่สุดแล้วละก็ต้องไม่พ้น น้ำแข็งใส แน่ๆ เพราะว่าเป็นอะไรที่ชื่นใจสุดๆ"
+summarize(text, engine="mt5-cpe-kmutt-thai-sentence-sum")
+# output: ['น้ําแข็งใสเป็นอะไรที่ชื่นใจที่สุด']
+
+
+
+ +
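The frequency engine described above can be approximated in a few lines of plain Python. This is a rough sketch of the idea (sentence splitting, the 0.1–0.9 normalized-frequency band, and sentence scoring), not the library's actual implementation.

from collections import Counter
from pythainlp.tokenize import sent_tokenize, word_tokenize

def frequency_summarize(text: str, n: int = 1) -> list:
    sents = sent_tokenize(text)
    words = [w for s in sents for w in word_tokenize(s)]
    freq = Counter(words)
    max_freq = max(freq.values())
    # Normalize by the maximum frequency and keep only mid-band words.
    norm = {w: c / max_freq for w, c in freq.items() if 0.1 <= c / max_freq <= 0.9}
    # Score each sentence by the sum of its words' normalized frequencies.
    scores = [sum(norm.get(w, 0.0) for w in word_tokenize(s)) for s in sents]
    ranked = sorted(zip(sents, scores), key=lambda pair: pair[1], reverse=True)
    return [s for s, _ in ranked[:n]]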
+
+pythainlp.summarize.extract_keywords(text: str, keyphrase_ngram_range: Tuple[int, int] = (1, 2), max_keywords: int = 5, min_df: int = 1, engine: str = 'keybert', tokenizer: str = 'newmm', stop_words: Iterable[str] | None = None) List[str][source]
+

This function returns most-relevant keywords (and/or keyphrases) from the input document. +Each algorithm may produce completely different keywords from each other, +so please be careful when choosing the algorithm.

+

Note: Calling extract_keywords() is expensive. For repeated use with KeyBERT (the default engine), creating a KeyBERT object once and reusing it is highly recommended.

+
+
Parameters:
+
    +
  • text (str) – text to be summarized

  • +
  • keyphrase_ngram_range (Tuple[int, int]) – Number of token units to be defined as keyword. +The token unit varies w.r.t. tokenizer_engine. +For instance, (1, 1) means each token (unigram) can be a keyword (e.g. “เสา”, “ไฟฟ้า”), +(1, 2) means one and two consecutive tokens (unigram and bigram) can be keywords +(e.g. “เสา”, “ไฟฟ้า”, “เสาไฟฟ้า”) (default: (1, 2))

  • +
  • max_keywords (int) – Number of maximum keywords to be returned. (default: 5)

  • +
  • min_df (int) – Minimum frequency required to be a keyword. (default: 1)

  • +
  • engine (str) – Name of algorithm to use for keyword extraction. (default: ‘keybert’)

  • +
  • tokenizer (str) – Name of tokenizer engine to use. Refer to options in pythainlp.tokenize.word_tokenize() (default: ‘newmm’)

  • +
  • stop_words (Optional[Iterable[str]]) – A list of stop words (a.k.a words to be ignored). +If not specified, pythainlp.corpus.thai_stopwords() is used. (default: None)

  • +
+
+
Returns:
+

list of keywords

+
+
+
+
Options for engine
    +
  • keybert (default) - KeyBERT keyword extraction algorithm

  • +
  • frequency - frequency of words

  • +
+
+
+
+
Example:
+

+
+
from pythainlp.summarize import extract_keywords
+
+text = '''
+    อาหาร หมายถึง ของแข็งหรือของเหลว
+    ที่กินหรือดื่มเข้าสู่ร่างกายแล้ว
+    จะทำให้เกิดพลังงานและความร้อนแก่ร่างกาย
+    ทำให้ร่างกายเจริญเติบโต
+    ซ่อมแซมส่วนที่สึกหรอ ควบคุมการเปลี่ยนแปลงต่างๆ ในร่างกาย
+    ช่วยทำให้อวัยวะต่างๆ ทำงานได้อย่างปกติ
+    อาหารจะต้องไม่มีพิษและไม่เกิดโทษต่อร่างกาย
+'''
+
+keywords = extract_keywords(text)
+
+# output: ['อวัยวะต่างๆ',
+# 'ซ่อมแซมส่วน',
+# 'เจริญเติบโต',
+# 'ควบคุมการเปลี่ยนแปลง',
+# 'มีพิษ']
+
+keywords = extract_keywords(text, max_keywords=10)
+
+# output: ['อวัยวะต่างๆ',
+# 'ซ่อมแซมส่วน',
+# 'เจริญเติบโต',
+# 'ควบคุมการเปลี่ยนแปลง',
+# 'มีพิษ',
+# 'ทำให้ร่างกาย',
+# 'ร่างกายเจริญเติบโต',
+# 'จะทำให้เกิด',
+# 'มีพิษและ',
+# 'เกิดโทษ']
+
+
+
+ +
+
+

Keyword Extraction Engines

+
+

KeyBERT

+

Minimal re-implementation of KeyBERT.

+

KeyBERT is a minimal and easy-to-use keyword extraction technique +that leverages BERT embeddings to create keywords and keyphrases +that are most similar to a document.

+

https://github.com/MaartenGr/KeyBERT

+
+
+class pythainlp.summarize.keybert.KeyBERT(model_name: str = 'airesearch/wangchanberta-base-att-spm-uncased')[source]
+
+
+__init__(model_name: str = 'airesearch/wangchanberta-base-att-spm-uncased')[source]
+
+ +
+
+extract_keywords(text: str, keyphrase_ngram_range: Tuple[int, int] = (1, 2), max_keywords: int = 5, min_df: int = 1, tokenizer: str = 'newmm', return_similarity=False, stop_words: Iterable[str] | None = None) List[str] | List[Tuple[str, float]][source]
+

Extract Thai keywords and/or keyphrases with KeyBERT algorithm. +See https://github.com/MaartenGr/KeyBERT.

+
+
Parameters:
+
    +
  • text (str) – text to be summarized

  • +
  • keyphrase_ngram_range (Tuple[int, int]) – Number of token units to be defined as keyword. +The token unit varies w.r.t. tokenizer_engine. +For instance, (1, 1) means each token (unigram) can be a keyword (e.g. “เสา”, “ไฟฟ้า”), +(1, 2) means one and two consecutive tokens (unigram and bigram) can be keywords +(e.g. “เสา”, “ไฟฟ้า”, “เสาไฟฟ้า”) (default: (1, 2))

  • +
  • max_keywords (int) – Number of maximum keywords to be returned. (default: 5)

  • +
  • min_df (int) – Minimum frequency required to be a keyword. (default: 1)

  • +
  • tokenizer (str) – Name of tokenizer engine to use. Refer to options in pythainlp.tokenize.word_tokenize() (default: ‘newmm’)

  • +
  • return_similarity (bool) – If True, return keyword scores. (default: False)

  • +
  • stop_words (Optional[Iterable[str]]) – A list of stop words (a.k.a words to be ignored). +If not specified, pythainlp.corpus.thai_stopwords() is used. (default: None)

  • +
+
+
Returns:
+

list of keywords with score

+
+
Example:
+

+
+
from pythainlp.summarize.keybert import KeyBERT
+
+text = '''
+    อาหาร หมายถึง ของแข็งหรือของเหลว
+    ที่กินหรือดื่มเข้าสู่ร่างกายแล้ว
+    จะทำให้เกิดพลังงานและความร้อนแก่ร่างกาย
+    ทำให้ร่างกายเจริญเติบโต
+    ซ่อมแซมส่วนที่สึกหรอ ควบคุมการเปลี่ยนแปลงต่างๆ ในร่างกาย
+    ช่วยทำให้อวัยวะต่างๆ ทำงานได้อย่างปกติ
+    อาหารจะต้องไม่มีพิษและไม่เกิดโทษต่อร่างกาย
+'''
+
+kb = KeyBERT()
+
+keywords = kb.extract_keywords(text)
+
+# output: ['อวัยวะต่างๆ',
+# 'ซ่อมแซมส่วน',
+# 'เจริญเติบโต',
+# 'ควบคุมการเปลี่ยนแปลง',
+# 'มีพิษ']
+
+keywords = kb.extract_keywords(text, max_keywords=10, return_similarity=True)
+
+# output: [('อวัยวะต่างๆ', 0.3228477063109462),
+# ('ซ่อมแซมส่วน', 0.31320597838000375),
+# ('เจริญเติบโต', 0.29115434699705506),
+# ('ควบคุมการเปลี่ยนแปลง', 0.2678430841321016),
+# ('มีพิษ', 0.24996827960821494),
+# ('ทำให้ร่างกาย', 0.23876962942443258),
+# ('ร่างกายเจริญเติบโต', 0.23191285218852364),
+# ('จะทำให้เกิด', 0.22425422716846247),
+# ('มีพิษและ', 0.22162962875299588),
+# ('เกิดโทษ', 0.20773497763458507)]
+
+
+
+ +
+
+embed(docs: str | List[str]) ndarray[source]
+

Create an embedding of each input in docs by averaging vectors from the last hidden layer.

+
+ +
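A usage sketch for embed; the shape noted in the comment depends on the underlying model and is an assumption, not a documented value.

from pythainlp.summarize.keybert import KeyBERT

kb = KeyBERT()

vecs = kb.embed(["อาหารจะต้องไม่มีพิษ", "ช่วยทำให้อวัยวะต่างๆ ทำงานได้อย่างปกติ"])
print(vecs.shape)  # (2, hidden_size); hidden_size depends on the underlying model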
+ +
+
+
+
+ + +
+
+ +
+
+
+
\ No newline at end of file
diff --git a/5.1/api/tag.html b/5.1/api/tag.html
new file mode 100644
index 0000000..a41fcee
--- /dev/null
+++ b/5.1/api/tag.html
@@ -0,0 +1,1086 @@
pythainlp.tag — PyThaiNLP v5.1.0 documentation
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.tag

+

The pythainlp.tag module contains functions used to attach linguistic and other annotations to different parts of a text, including part-of-speech (POS) tags and named entity (NE) tags.

+

For POS tags, there are three sets of available tags: Universal POS tags, ORCHID POS tags [1], and LST20 POS tags [2].

+

The following table shows Universal POS tags as used in Universal Dependencies (UD):

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Abbreviation

Part-of-Speech tag

Examples

ADJ

Adjective

ใหม่, พิเศษ , ก่อน, มาก, สูง

ADP

Adposition

แม้, ว่า, เมื่อ, ของ, สำหรับ

ADV

Adverb

ก่อน, ก็, เล็กน้อย, เลย, สุด

AUX

Auxiliary

เป็น, ใช่, คือ, คล้าย

CCONJ

Coordinating conjunction

แต่, และ, หรือ

DET

Determiner

ที่, นี้, ซึ่ง, ทั้ง, ทุก, หลาย

INTJ

Interjection

อุ้ย, โอ้ย

NOUN

Noun

กำมือ, พวก, สนาม, กีฬา, บัญชี

NUM

Numeral

5,000, 103.7, 2004, หนึ่ง, ร้อย

PART

Particle

มา ขึ้น ไม่ ได้ เข้า

PRON

Pronoun

เรา, เขา, ตัวเอง, ใคร, เธอ

PROPN

Proper noun

โอบามา, แคปิตอลฮิล, จีโอพี, ไมเคิล

PUNCT

Punctuation

(, ), “, ‘, :

SCONJ

Subordinating conjunction

หาก

VERB

Verb

เปิด, ให้, ใช้, เผชิญ, อ่าน

+

The following table shows POS tags as used in ORCHID:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Abbreviation

Part-of-Speech tag

Examples

NPRP

Proper noun

วินโดวส์ 95, โคโรน่า, โค้ก

NCNM

Cardinal number

หนึ่ง, สอง, สาม, 1, 2, 10

NONM

Ordinal number

ที่หนึ่ง, ที่สอง, ที่สาม, ที่1, ที่2

NLBL

Label noun

1, 2, 3, 4, ก, ข, a, b

NCMN

Common noun

หนังสือ, อาหาร, อาคาร, คน

NTTL

Title noun

ครู, พลเอก

PPRS

Personal pronoun

คุณ, เขา, ฉัน

PDMN

Demonstrative pronoun

นี่, นั้น, ที่นั่น, ที่นี่

PNTR

Interrogative pronoun

ใคร, อะไร, อย่างไร

PREL

Relative pronoun

ที่, ซึ่ง, อัน, ผู้

VACT

Active verb

ทำงาน, ร้องเพลง, กิน

VSTA

Stative verb

เห็น, รู้, คือ

VATT

Attributive verb

อ้วน, ดี, สวย

XVBM

Pre-verb auxiliary, before negator “ไม่”

เกิด, เกือบ, กำลัง

XVAM

Pre-verb auxiliary, after negator “ไม่”

ค่อย, น่า, ได้

XVMM

Pre-verb, before or after negator “ไม่”

ควร, เคย, ต้อง

XVBB

Pre-verb auxiliary, in imperative mood

กรุณา, จง, เชิญ, อย่า, ห้าม

XVAE

Post-verb auxiliary

ไป, มา, ขึ้น

DDAN

+
Definite determiner, after noun without
+
classifier in between
+
+

ยี่, นั่น, โน่น, ทั้งหมด

DDAC

+
Definite determiner, allowing classifier
+
in between
+
+

นี้, นั้น, โน้น, นู้น

DDBQ

+
Definite determiner, between noun and
+
classifier or preceding quantitative expression
+
+

ทั้ง, อีก, เพียง

DDAQ

+
Definite determiner,
+
following quantitative expression
+
+

พอดี, ถ้วน

DIAC

+
Indefinite determiner, following noun; allowing
+
classifier in between
+
+

ไหน, อื่น, ต่างๆ

DIBQ

+
Indefinite determiner, between noun and
+
classifier or preceding quantitative expression
+
+

บาง, ประมาณ, เกือบ

DIAQ

+
Indefinite determiner,
+
following quantitative expression
+
+

กว่า, เศษ

DCNM

Determiner, cardinal number expression

หนึ่งคน, เสือ, 2 ตัว

DONM

Determiner, ordinal number expression

ที่หนึ่ง, ที่สอง, ที่สุดท้าย

ADVN

Adverb with normal form

เก่ง, เร็ว, ช้า, สม่ำเสมอ

ADVI

Adverb with iterative form

เร็วๆ, เสมอๆ, ช้าๆ

ADVP

Adverb with prefixed form

โดยเร็ว

ADVS

Sentential adverb

โดยปกติ, ธรรมดา

CNIT

Unit classifier

ตัว, คน, เล่ม

CLTV

Collective classifier

+
คู่, กลุ่ม, ฝูง, เชิง, ทาง,
+
ด้าน, แบบ, รุ่น
+
+

CMTR

Measurement classifier

กิโลกรัม, แก้ว, ชั่วโมง

CFQC

Frequency classifier

ครั้ง, เที่ยว

CVBL

Verbal classifier

ม้วน, มัด

JCRG

Coordinating conjunction

และ, หรือ, แต่

JCMP

Comparative conjunction

กว่า, เหมือนกับ, เท่ากับ

JSBR

Subordinating conjunction

เพราะว่า, เนื่องจาก ที่, แม้ว่า, ถ้า

RPRE

Preposition

จาก, ละ, ของ, ใต้, บน

INT

Interjection

โอ้บ, โอ้, เออ, เอ๋, อ๋อ

FIXN

Nominal prefix

การทำงาน, ความสนุกสนาน

FIXV

Adverbial prefix

อย่างเร็ว

EAFF

Ending for affirmative sentence

จ๊ะ, จ้ะ, ค่ะ, ครับ, นะ, น่า, เถอะ

EITT

Ending for interrogative sentence

หรือ, เหรอ, ไหม, มั้ย

NEG

Negator

ไม่, มิได้, ไม่ได้, มิ

PUNC

Punctuation

(, ), “, ,, ;

+

The ORCHID corpus uses a different set of POS tags. Thus, we provide a mapping from ORCHID POS tags to Universal (UD) POS tags.

+

The following table shows the mapping of POS tags from ORCHID to UD:

+

Details about LST20 POS tags are available in [2].

+

The following table shows the mapping of POS tags from LST20 to UD:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

LST20 POS tags

Corresponding UD POS tag

AJ

ADJ

AV

ADV

AX

AUX

CC

CCONJ

CL

NOUN

FX

NOUN

IJ

INTJ

NN

NOUN

NU

NUM

PA

PART

PR

PROPN

PS

ADP

PU

PUNCT

VV

VERB

XX

X

+

For the NE, we use Inside-outside-beginning (IOB) format to tag NE for each word.

+

B- prefix indicates the beginning token of the chunk. I- prefix indicates the intermediate token within the chunk. O indicates that the token does not belong to any NE chunk.

+

For instance, given the sentence “บารัค โอบามาเป็นประธานาธิบดี”, the tokens “บารัค”, “โอบามา”, “เป็น”, and “ประธานาธิบดี” would be tagged “B-PERSON”, “I-PERSON”, “O”, and “O” respectively.
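In code form, that tagging is simply a list of (token, tag) pairs:

# IOB-tagged tokens for the example sentence above.
tagged = [
    ("บารัค", "B-PERSON"),
    ("โอบามา", "I-PERSON"),
    ("เป็น", "O"),
    ("ประธานาธิบดี", "O"),
]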

+

The following table shows named entity (NE) tags as used in PyThaiNLP:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Named Entity tag

Examples

DATE

2/21/2004, 16 ก.พ., จันทร์

TIME

16.30 น., 5 วัน, 1-3 ปี

EMAIL

info@nrpsc.ac.th

LEN

30 กิโลเมตร, 5 กม.

LOCATION

ไทย, จ.ปราจีนบุรี, กำแพงเพชร

ORGANIZATION

กรมวิทยาศาสตร์การแพทย์, อย.

PERSON

น.พ.จรัล, นางประนอม ทองจันทร์

PHONE

1200, 0 2670 8888

URL

http://www.bangkokhealth.com/

ZIP

10400, 11130

Money

2.7 ล้านบาท, 2,000 บาท

LAW

พ.ร.บ.โรคระบาด พ.ศ.2499, รัฐธรรมนูญ

+
+

Modules

+
+
+pythainlp.tag.pos_tag(words: List[str], engine: str = 'perceptron', corpus: str = 'orchid') List[Tuple[str, str]][source]
+

Marks words with part-of-speech (POS) tags, such as ‘NOUN’ and ‘VERB’.

+
+
Parameters:
+
    +
  • words (list) – a list of tokenized words

  • +
  • engine (str) –

      +
    • perceptron - perceptron tagger (default)

    • +
    • unigram - unigram tagger

    • +
    • wangchanberta - wangchanberta model.

    • +
    • tltk - TLTK: Thai Language Toolkit (support TNC corpora only. If you choose other corpora, they will be converted to TNC corpora.)

    • +
    +

  • +
  • corpus (str) –

the corpus that is used to create the language model for the tagger:
* orchid - ORCHID corpus, text from Thai academic articles (default)
* orchid_ud - ORCHID text, with tags mapped to Universal POS tags
* blackboard - Blackboard treebank
* blackboard_ud - Blackboard text, with tags mapped to Universal POS tags from Universal Dependencies <https://universaldependencies.org/>
* pud - Parallel Universal Dependencies (PUD) treebanks, which natively use Universal POS tags
* tdtb - Thai Discourse Treebank, which natively uses Universal POS tags
* tnc - Thai National Corpus (supported by the tltk engine only)
* tud - Thai Universal Dependency Treebank (TUD)

Returns: a list of tuples (word, POS tag)

    +

  • +
+
+
Return type:
+

list[tuple[str, str]]

+
+
Example:
+

+
+

Tag words with corpus orchid (default):

+
from pythainlp.tag import pos_tag
+
+words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ', \
+    'นายก', 'เชอร์ชิล']
+pos_tag(words)
+# output:
+# [('ฉัน', 'PPRS'), ('มี', 'VSTA'), ('ชีวิต', 'NCMN'), ('รอด', 'NCMN'),
+#   ('ใน', 'RPRE'), ('อาคาร', 'NCMN'), ('หลบภัย', 'NCMN'),
+#   ('ของ', 'RPRE'), ('นายก', 'NCMN'), ('เชอร์ชิล', 'NCMN')]
+
+
+

Tag words with corpus orchid_ud:

+
from pythainlp.tag import pos_tag
+
+words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ', \
+    'นายก', 'เชอร์ชิล']
+pos_tag(words, corpus='orchid_ud')
+# output:
+# [('ฉัน', 'PROPN'), ('มี', 'VERB'), ('ชีวิต', 'NOUN'),
+#   ('รอด', 'NOUN'), ('ใน', 'ADP'),  ('อาคาร', 'NOUN'),
+#   ('หลบภัย', 'NOUN'), ('ของ', 'ADP'), ('นายก', 'NOUN'),
+#   ('เชอร์ชิล', 'NOUN')]
+
+
+

Tag words with corpus pud:

+
from pythainlp.tag import pos_tag
+
+words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ', \
+    'นายก', 'เชอร์ชิล']
+pos_tag(words, corpus='pud')
+# [('ฉัน', 'PRON'), ('มี', 'VERB'), ('ชีวิต', 'NOUN'), ('รอด', 'VERB'),
+#   ('ใน', 'ADP'), ('อาคาร', 'NOUN'), ('หลบภัย', 'NOUN'),
+#   ('ของ', 'ADP'), ('นายก', 'NOUN'), ('เชอร์ชิล', 'PROPN')]
+
+
+

Tag words with different engines including perceptron and unigram:

+
from pythainlp.tag import pos_tag
+
+words = ['เก้าอี้','มี','จำนวน','ขา', ' ', '=', '3']
+
+pos_tag(words, engine='perceptron', corpus='orchid')
+# output:
+# [('เก้าอี้', 'NCMN'), ('มี', 'VSTA'), ('จำนวน', 'NCMN'),
+#   ('ขา', 'NCMN'), (' ', 'PUNC'),
+#   ('=', 'PUNC'), ('3', 'NCNM')]
+
+pos_tag(words, engine='unigram', corpus='pud')
+# output:
+# [('เก้าอี้', None), ('มี', 'VERB'), ('จำนวน', 'NOUN'), ('ขา', None),
+#   ('<space>', None), ('<equal>', None), ('3', 'NUM')]
+
+
+
+ +
+
+pythainlp.tag.pos_tag_sents(sentences: List[List[str]], engine: str = 'perceptron', corpus: str = 'orchid') List[List[Tuple[str, str]]][source]
+

Marks sentences with part-of-speech (POS) tags.

+
+
Parameters:
+
    +
  • sentences (list) – a list of lists of tokenized words

  • +
  • engine (str) –

      +
    • perceptron - perceptron tagger (default)

    • +
    • unigram - unigram tagger

    • +
    • tltk - TLTK: Thai Language Toolkit (support TNC corpus only. If you choose other corpora, they will be converted to TNC corpora.)

    • +
    +

  • +
  • corpus (str) –

the corpus that is used to create the language model for the tagger:
* orchid - ORCHID corpus, text from Thai academic articles (default)
* orchid_ud - ORCHID text, with tags mapped to Universal POS tags
* blackboard - Blackboard treebank
* blackboard_ud - Blackboard text, with tags mapped to Universal POS tags from Universal Dependencies <https://universaldependencies.org/>
* pud - Parallel Universal Dependencies (PUD) treebanks, which natively use Universal POS tags
* tnc - Thai National Corpus (supported by the tltk engine only)

    +

  • +
+
+
Returns:
+

a list of lists of tuples (word, POS tag)

+
+
Return type:
+

list[list[tuple[str, str]]]

+
+
Example:
+

+
+

Labels POS for two sentences:

+
from pythainlp.tag import pos_tag_sents
+
+sentences = [['เก้าอี้','มี','3','ขา'], \
+                    ['นก', 'บิน', 'กลับ', 'รัง']]
+pos_tag_sents(sentences, corpus='pud')
+# output:
+# [[('เก้าอี้', 'PROPN'), ('มี', 'VERB'), ('3', 'NUM'),
+#   ('ขา', 'NOUN')], [('นก', 'NOUN'), ('บิน', 'VERB'),
+#   ('กลับ', 'VERB'), ('รัง', 'NOUN')]]
+
+
+
+ +
+
+pythainlp.tag.tag_provinces(tokens: List[str]) List[Tuple[str, str]][source]
+

This function recognizes Thailand provinces in text.

+

Note that it uses exact match and considers no context.

+
+
Parameters:
+

tokens (list[str]) – a list of words

+
+
Returns:
+

a list of tuples indicating NER for LOCATION in IOB format

+
+
Return type:
+

list[tuple[str, str]]

+
+
Example:
+

+
+
from pythainlp.tag import tag_provinces
+
+text = ['หนองคาย', 'น่าอยู่']
+tag_provinces(text)
+# output: [('หนองคาย', 'B-LOCATION'), ('น่าอยู่', 'O')]
+
+
+
+ +
+
+pythainlp.tag.chunk_parse(sent: List[Tuple[str, str]], engine: str = 'crf', corpus: str = 'orchidpp') List[str][source]
+

This function parses Thai sentence to phrase structure in IOB format.

+
+
Parameters:
+
    +
  • sent (list) – list [(word, part-of-speech)]

  • +
  • engine (str) – chunk parse engine (now, it has crf only)

  • +
  • corpus (str) – chunk parse corpus (now, it has orchidpp only)

  • +
+
+
Returns:
+

a list of tuples (word, part-of-speech, chunking)

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.tag import chunk_parse, pos_tag
+
+tokens = ["ผม", "รัก", "คุณ"]
+tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid")
+
+print(chunk_parse(tokens_pos))
+# output: ['B-NP', 'B-VP', 'I-VP']
+
+
+
+ +
+
+class pythainlp.tag.NER(engine: str = 'thainer-v2', corpus: str = 'thainer')[source]
+

Class of named-entity recognizer

+
+
Parameters:
+
    +
  • engine (str) – engine of named-entity recognizer

  • +
  • corpus (str) – corpus

  • +
+
+
+
+
Options for engine
    +
  • thainer-v2 - Thai NER engine v2.0 for Thai NER 2.0 (default)

  • +
  • thainer - Thai NER engine

  • +
  • tltk - wrapper for TLTK.

  • +
+
+
Options for corpus
    +
  • thainer - Thai NER corpus (default)

  • +
+
+
+

Note: The tltk engine supports NER models from tltk only.

+
+
+__init__(engine: str = 'thainer-v2', corpus: str = 'thainer') None[source]
+
+ +
+
+load_engine(engine: str, corpus: str) None[source]
+
+ +
+
+tag(text, pos=False, tag=False) List[Tuple[str, str]] | List[Tuple[str, str, str]] | str[source]
+

This function tags named entities in text in IOB format.

+
+
Parameters:
+
    +
  • text (str) – text in Thai to be tagged

  • +
  • pos (bool) – output with part-of-speech tags. (wangchanberta is not supported)

  • +
  • tag (bool) – output HTML-like tags.

  • +
+
+
Returns:
+

a list of tuples associated with tokenized words, NER tags, +POS tags (if the parameter pos is specified as True), +and output HTML-like tags (if the parameter tag is +specified as True). +Otherwise, return a list of tuples associated with tokenized +words and NER tags

+
+
Return type:
+

Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]

+
+
Example:
+
>>> from pythainlp.tag import NER
+>>>
+>>> ner = NER("thainer")
+>>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์")
+[('ทดสอบ', 'O'),
+('นาย', 'B-PERSON'),
+('วรรณ', 'I-PERSON'),
+('พงษ์', 'I-PERSON'),
+(' ', 'I-PERSON'),
+('ภัททิย', 'I-PERSON'),
+('ไพบูลย์', 'I-PERSON')]
+>>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์", tag=True)
+'ทดสอบ<PERSON>นายวรรณพงษ์ ภัททิยไพบูลย์</PERSON>'
+
+
+
+
+
+ +
+ +
+
+class pythainlp.tag.NNER(engine: str = 'thai_nner')[source]
+

Nested Named Entity Recognition

+
+
Parameters:
+
    +
  • engine (str) – engine of nested named entity recognizer

  • +
  • corpus (str) – corpus

  • +
+
+
+
+
Options for engine
    +
  • thai_nner - Thai NER engine

  • +
+
+
+
+
+__init__(engine: str = 'thai_nner') None[source]
+
+ +
+
+load_engine(engine: str = 'thai_nner') None[source]
+
+ +
+
+tag(text) Tuple[List[str], List[dict]][source]
+

This function tags nested named entities.

+
+
Parameters:
+

text (str) – text in Thai to be tagged

+
+
Returns:
+

a list of tuples associated with tokenized words and NNER tags.

+
+
Return type:
+

Tuple[List[str], List[dict]]

+
+
Example:
+
>>> from pythainlp.tag.named_entity import NNER
+>>> nner = NNER()
+>>> nner.tag("แมวทำอะไรตอนห้าโมงเช้า")
+([
+    '<s>',
+    '',
+    'แมว',
+    'ทํา',
+    '',
+    'อะไร',
+    'ตอน',
+    '',
+    'ห้า',
+    '',
+    'โมง',
+    '',
+    'เช้า',
+    '</s>'
+],
+[
+    {
+        'text': ['', 'ห้า'],
+        'span': [7, 9],
+        'entity_type': 'cardinal'
+    },
+    {
+        'text': ['', 'ห้า', '', 'โมง'],
+        'span': [7, 11],
+        'entity_type': 'time'
+    },
+    {
+        'text': ['', 'โมง'],
+        'span': [9, 11],
+        'entity_type': 'unit'
+    }
+])
+
+
+
+
+
+ +
+ +
+
+class pythainlp.tag.thainer.ThaiNameTagger(version: str = '1.4')[source]
+

Thai named-entity recognizer (Thai NER). This class supports Thai NER 1.4 and 1.5 only.
:param str version: Thai NER version. It supports Thai NER 1.4 and 1.5. The default value is 1.4.

+
+
+
Example:
+

+
+
from pythainlp.tag.thainer import ThaiNameTagger
+
+thainer14 = ThaiNameTagger(version="1.4")
+thainer14.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
+
+
+
+
+__init__(version: str = '1.4') None[source]
+

Thai named-entity recognizer.

+
+
Parameters:
+

version (str) – Thai NER version. +It’s support Thai NER 1.4 & 1.5. +The default value is 1.4

+
+
+
+ +
+
+get_ner(text: str, pos: bool = True, tag: bool = False) List[Tuple[str, str]] | List[Tuple[str, str, str]][source]
+

This function tags named-entities in text in IOB format.

+
+
Parameters:
+
    +
  • text (str) – text in Thai to be tagged

  • +
  • pos (bool) – To include POS tags in the results (True) or +exclude (False). The default value is True

  • +
  • tag (bool) – output HTML-like tags.

  • +
+
+
Returns:
+

a list of tuples associated with tokenized words, NER tags, +POS tags (if the parameter pos is specified as True), +and output HTML-like tags (if the parameter tag is +specified as True). +Otherwise, return a list of tuples associated with tokenized +words and NER tags

+
+
Return type:
+

Union[list[tuple[str, str]], list[tuple[str, str, str]]], str

+
+
Note:
+
    +
  • For the POS tags to be included in the results, this function +uses pythainlp.tag.pos_tag() with engine perceptron +and corpus orchid_ud.

  • +
+
+
Example:
+
>>> from pythainlp.tag.thainer import ThaiNameTagger
+>>>
+>>> ner = ThaiNameTagger()
+>>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
+[('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
+('15', 'NUM', 'B-DATE'), (' ', 'PUNCT', 'I-DATE'),
+('ก.ย.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'),
+('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'),
+('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'),
+('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
+('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'),
+('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'),
+('น.', 'NOUN', 'I-TIME')]
+>>>
+>>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
+                pos=False)
+[('วันที่', 'O'), (' ', 'O'),
+('15', 'B-DATE'), (' ', 'I-DATE'),
+('ก.ย.', 'I-DATE'), (' ', 'I-DATE'),
+('61', 'I-DATE'), (' ', 'O'),
+('ทดสอบ', 'O'), ('ระบบ', 'O'),
+('เวลา', 'O'), (' ', 'O'),
+('14', 'B-TIME'), (':', 'I-TIME'),
+('49', 'I-TIME'), (' ', 'I-TIME'),
+('น.', 'I-TIME')]
+>>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
+                tag=True)
+'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>
+14:49 น.</TIME>'
+
+
+
+
+
+ +
+ +
+
+

Tagger Engines

+
+

perceptron

+

The perceptron tagger is a part-of-speech tagger that uses the averaged, structured perceptron algorithm.

+
+
+

unigram

+

Unigram tagger doesn’t take the ordering of words in the list into account.

+
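A comparable sketch with the unigram engine; because word order is ignored, each token simply receives its most likely tag from the training corpus:

from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize

words = word_tokenize("ฉันรักแมว")
pos_tag(words, engine="unigram", corpus="orchid_ud")
# each word is tagged independently of its neighbors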
+
+
+

References

+ +
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/tokenize.html b/5.1/api/tokenize.html new file mode 100644 index 0000000..733ee03 --- /dev/null +++ b/5.1/api/tokenize.html @@ -0,0 +1,1404 @@ + + + + + + + + + pythainlp.tokenize — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.tokenize

+

The pythainlp.tokenize module contains a comprehensive set of functions and classes for tokenizing Thai text into various units, such as sentences, words, subwords, and more. This module is a fundamental component of the PyThaiNLP library, providing tools for natural language processing in the Thai language.

+
+

Modules

+
+
+pythainlp.tokenize.sent_tokenize(text: str | List[str], engine: str = 'crfcut', keep_whitespace: bool = True) List[str][source]
+

Sentence tokenizer.

+

Tokenizes running text into “sentences”. Supports both string and list of strings.

+
+
Parameters:
+
    +
  • text – the text (string) or list of words (list of strings) to be tokenized

  • +
  • engine (str) – choose among ‘crfcut’, ‘whitespace’, ‘whitespace+newline’

  • +
+
+
Returns:
+

list of split sentences

+
+
Return type:
+

list[str]

+
+
+
+
Options for engine
    +
  • crfcut - (default) split by CRF trained on TED dataset

  • +
  • thaisum - The implementation of sentence segmenter from Nakhun Chumpolsathien, 2020

  • +
  • tltk - split by TLTK.,

  • +
  • wtp - split by wtpsplit. It supports several model sizes: use wtp for the mini model, wtp-tiny for the wtp-bert-tiny model (default), wtp-mini for the wtp-bert-mini model, wtp-base for the wtp-canine-s-1l model, and wtp-large for the wtp-canine-s-12l model.

  • +
  • whitespace+newline - split by whitespace and newline.

  • +
  • whitespace - split by whitespace, specifically with regex pattern r" +"

  • +
+
+
+
+
Example:
+

+
+

Split the text based on whitespace:

+
from pythainlp.tokenize import sent_tokenize
+
+sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม"
+sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \
+และได้รับมอบหมายให้ประจำในระดับภูมิภาค"
+
+sent_tokenize(sentence_1, engine="whitespace")
+# output: ['ฉันไปประชุมเมื่อวันที่', '11', 'มีนาคม']
+
+sent_tokenize(sentence_2, engine="whitespace")
+# output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ',
+#   '\nและได้รับมอบหมายให้ประจำในระดับภูมิภาค']
+
+
+

Split the text based on whitespace and newline:

+
sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม"
+sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \
+และได้รับมอบหมายให้ประจำในระดับภูมิภาค"
+
+sent_tokenize(sentence_1, engine="whitespace+newline")
+# output: ['ฉันไปประชุมเมื่อวันที่', '11', 'มีนาคม']
+sent_tokenize(sentence_2, engine="whitespace+newline")
+# output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ',
+'\nและได้รับมอบหมายให้ประจำในระดับภูมิภาค']
+
+
+

Split the text using CRF trained on TED dataset:

+
sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม"
+sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \
+และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค"
+
+sent_tokenize(sentence_1, engine="crfcut")
+# output: ['ฉันไปประชุมเมื่อวันที่ 11 มีนาคม']
+
+sent_tokenize(sentence_2, engine="crfcut")
+# output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ ',
+'และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค']
+
+
+

Splits Thai text into sentences. This function identifies sentence boundaries, which is essential for text segmentation and analysis.

+
+ +
+
+pythainlp.tokenize.paragraph_tokenize(text: str, engine: str = 'wtp-mini', paragraph_threshold: float = 0.5, style: str = 'newline') List[List[str]][source]
+

Paragraph tokenizer.

+

Tokenizes text into paragraphs.

+
+
Parameters:
+
    +
  • text (str) – text to be tokenized

  • +
  • engine (str) – the name of paragraph tokenizer

  • +
+
+
Returns:
+

list of paragraphs

+
+
Return type:
+

List[List[str]]

+
+
+
+
Options for engine
    +
  • wtp - split by wtpsplit. It supports several model sizes: use wtp for the mini model, wtp-tiny for the wtp-bert-tiny model (default), wtp-mini for the wtp-bert-mini model, wtp-base for the wtp-canine-s-1l model, and wtp-large for the wtp-canine-s-12l model.

  • +
+
+
+
+
Example:
+

+
+

Split the text based on wtp:

+
from pythainlp.tokenize import paragraph_tokenize
+
+sent = (
+    "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต"
+    +"  มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
+    +" จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
+)
+
+paragraph_tokenize(sent)
+# output: [
+# ['(1) '], 
+# [
+#   'บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต  ',
+#   'มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ',
+#   'จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ',
+#   'ณ ที่นี้'
+# ]]
+
+
+

Segments text into paragraphs, which can be valuable for document-level analysis or summarization.

+
+ +
+
+pythainlp.tokenize.subword_tokenize(text: str, engine: str = 'tcc', keep_whitespace: bool = True) List[str][source]
+

Subword tokenizer for tokenizing text into units smaller than syllables.

+

Tokenizes text into inseparable units of contiguous Thai characters, namely Thai Character Clusters (TCCs). TCCs are units based on Thai spelling features that cannot be split any further, such as ‘ก็’, ‘จะ’, ‘ไม่’, and ‘ฝา’; if these units were separated, they could no longer be spelled out. This function applies TCC rules to tokenize the text into the smallest such units.

+

For example, the word ‘ขนมชั้น’ would be tokenized +into ‘ข’, ‘น’, ‘ม’, and ‘ชั้น’.

+
+
Parameters:
+
    +
  • text (str) – text to be tokenized

  • +
  • engine (str) – the name of subword tokenizer

  • +
  • keep_whitespace (bool) – keep whitespace

  • +
+
+
Returns:
+

list of subwords

+
+
Return type:
+

List[str]

+
+
+
+
Options for engine
    +
  • dict - newmm word tokenizer with a syllable dictionary

  • +
  • etcc - Enhanced Thai Character Cluster (Inrut et al. 2001)

  • +
  • han_solo - CRF syllable segmenter for Thai that can work in the Thai social media domain. See PyThaiNLP/Han-solo.

  • +
  • ssg - CRF syllable segmenter for Thai. See ponrawee/ssg.

  • +
  • tcc (default) - Thai Character Cluster (Theeramunkong et al. 2000)

  • +
  • tcc_p - Thai Character Cluster + improved rules that are used in newmm

  • +
  • tltk - syllable tokenizer from tltk. See tltk.

  • +
  • wangchanberta - SentencePiece from wangchanberta model

  • +
+
+
+
+
Example:
+

+
+

Tokenize text into subwords based on tcc:

+
from pythainlp.tokenize import subword_tokenize
+
+text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
+text_2 = "ความแปลกแยกและพัฒนาการ"
+
+subword_tokenize(text_1, engine='tcc')
+# output: ['ยุ', 'ค', 'เริ่ม', 'แร', 'ก',
+#   'ข', 'อ', 'ง', ' ', 'รา', 'ช', 'ว', 'ง',
+#   'ศ', '์', 'ห', 'มิ', 'ง']
+
+subword_tokenize(text_2, engine='tcc')
+# output: ['ค', 'วา', 'ม', 'แป', 'ล', 'ก', 'แย', 'ก',
+'และ', 'พัฒ','นา', 'กา', 'ร']
+
+
+

Tokenize text into subwords based on etcc:

+
text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
+text_2 = "ความแปลกแยกและพัฒนาการ"
+
+subword_tokenize(text_1, engine='etcc')
+# output: ['ยุคเริ่มแรกของ ราชวงศ์หมิง']
+
+subword_tokenize(text_2, engine='etcc')
+# output: ['ความแปลกแยกและ', 'พัฒ', 'นาการ']
+
+
+

Tokenize text into subwords based on wangchanberta:

+
text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
+text_2 = "ความแปลกแยกและพัฒนาการ"
+
+subword_tokenize(text_1, engine='wangchanberta')
+# output: ['▁', 'ยุค', 'เริ่มแรก', 'ของ', '▁', 'ราชวงศ์', 'หมิง']
+
+subword_tokenize(text_2, engine='wangchanberta')
+# output: ['▁ความ', 'แปลก', 'แยก', 'และ', 'พัฒนาการ']
+
+
+

Tokenizes text into subwords, which can be helpful for various NLP tasks, including subword embeddings.

+
+ +
+
+pythainlp.tokenize.syllable_tokenize(text: str, engine: str = 'han_solo', keep_whitespace: bool = True) List[str][source]
+

Syllable tokenizer

+

Tokenizes text into inseparable units of +Thai syllables.

+
+
Parameters:
+
    +
  • text (str) – text to be tokenized

  • +
  • engine (str) – the name of syllable tokenizer

  • +
  • keep_whitespace (bool) – keep whitespace

  • +
+
+
Returns:
+

list of subwords

+
+
Return type:
+

List[str]

+
+
+
+
Options for engine
    +
  • dict - newmm word tokenizer with a syllable dictionary

  • +
  • han_solo - CRF syllable segmenter for Thai that can work in the Thai social media domain. See PyThaiNLP/Han-solo.

  • +
  • ssg - CRF syllable segmenter for Thai. See ponrawee/ssg.

  • +
  • tltk - syllable tokenizer from tltk. See tltk.

  • +
+
+
+

Divides text into syllables, allowing you to work with individual Thai language phonetic units.

+
+ +
+
+pythainlp.tokenize.word_tokenize(text: str, custom_dict: ~pythainlp.util.trie.Trie = <pythainlp.util.trie.Trie object>, engine: str = 'newmm', keep_whitespace: bool = True, join_broken_num: bool = True) List[str][source]
+

Word tokenizer.

+

Tokenizes running text into words (list of strings).

+
+
Parameters:
+
    +
  • text (str) – text to be tokenized

  • +
  • engine (str) – name of the tokenizer to be used

  • +
  • custom_dict (pythainlp.util.Trie) – dictionary trie (some engine may not support)

  • +
  • keep_whitespace (bool) – True to keep whitespace, a common mark +for end of phrase in Thai. +Otherwise, whitespace is omitted.

  • +
  • join_broken_num (bool) – True to rejoin formatted numeric that could be wrongly separated. +Otherwise, formatted numeric could be wrongly separated.

  • +
+
+
Returns:
+

list of words

+
+
Return type:
+

List[str]

+
+
+
+
Options for engine
    +
  • attacut - wrapper for +AttaCut., +learning-based approach

  • +
  • deepcut - wrapper for +DeepCut, +learning-based approach

  • +
  • icu - wrapper for a word tokenizer in +PyICU., +from ICU (International Components for Unicode), +dictionary-based

  • +
  • longest - dictionary-based, longest matching

  • +
  • mm - “multi-cut”, dictionary-based, maximum matching

  • +
  • nercut - dictionary-based, maximal matching, +constrained by Thai Character Cluster (TCC) boundaries, +combining tokens that are parts of the same named-entity

  • +
  • newmm (default) - “new multi-cut”, +dictionary-based, maximum matching, +constrained by Thai Character Cluster (TCC) boundaries +with improved TCC rules that are used in newmm.

  • +
  • newmm-safe - newmm, with a mechanism to avoid long +processing time for text with continuously ambiguous breaking points

  • +
  • nlpo3 - wrapper for a word tokenizer in +nlpO3., +adaptation of newmm in Rust (2.5x faster)

  • +
  • oskut - wrapper for +OSKut., +Out-of-domain StacKed cut for Word Segmentation

  • +
  • sefr_cut - wrapper for +SEFR CUT., +Stacked Ensemble Filter and Refine for Word Segmentation

  • +
  • tltk - wrapper for TLTK, maximum collocation approach

  • +
+
+
+
+
Note:
+
    +
  • The custom_dict parameter only works for deepcut, longest, newmm, and newmm-safe engines.

  • +
+
+
Example:
+

+
+

Tokenize text with different tokenizers:

+
from pythainlp.tokenize import word_tokenize
+
+text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
+
+word_tokenize(text, engine="newmm")
+# output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
+
+word_tokenize(text, engine='attacut')
+# output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
+
+
+

Tokenize text with whitespace omitted:

+
text = "วรรณกรรม ภาพวาด และการแสดงงิ้ว "
+
+word_tokenize(text, engine="newmm")
+# output:
+# ['วรรณกรรม', ' ', 'ภาพวาด', ' ', 'และ', 'การแสดง', 'งิ้ว', ' ']
+
+word_tokenize(text, engine="newmm", keep_whitespace=False)
+# output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว']
+
+
+

Join broken formatted numeric (e.g. time, decimals, IP addresses):

+
text = "เงิน1,234บาท19:32น 127.0.0.1"
+
+word_tokenize(text, engine="attacut", join_broken_num=False)
+# output:
+# ['เงิน', '1', ',', '234', 'บาท', '19', ':', '32น', ' ',
+#  '127', '.', '0', '.', '0', '.', '1']
+
+word_tokenize(text, engine="attacut", join_broken_num=True)
+# output:
+# ['เงิน', '1,234', 'บาท', '19:32น', ' ', '127.0.0.1']
+
+
+

Tokenize with default and custom dictionaries:

+
from pythainlp.corpus.common import thai_words
+from pythainlp.tokenize import dict_trie
+
+text = 'ชินโซ อาเบะ เกิด 21 กันยายน'
+
+word_tokenize(text, engine="newmm")
+# output:
+# ['ชิน', 'โซ', ' ', 'อา', 'เบะ', ' ',
+#  'เกิด', ' ', '21', ' ', 'กันยายน']
+
+custom_dict_japanese_name = set(thai_words())
+custom_dict_japanese_name.add('ชินโซ')
+custom_dict_japanese_name.add('อาเบะ')
+
+trie = dict_trie(dict_source=custom_dict_japanese_name)
+
+word_tokenize(text, engine="newmm", custom_dict=trie)
+# output:
+# ['ชินโซ', ' ', 'อาเบะ', ' ',
+#  'เกิด', ' ', '21', ' ', 'กันยายน']
+
+
+

Splits text into words. This function is a fundamental tool for Thai language text analysis.

+
+ +
+
+pythainlp.tokenize.word_detokenize(segments: List[List[str]] | List[str], output: str = 'str') List[str] | str[source]
+

Word detokenizer.

+

This function will detokenize the list of words in each sentence into text.

+
+
Parameters:
+
    +
  • segments (List[List[str]] | List[str]) – list of sentences, each given as a list of words.

  • +
  • output (str) – the output type (str or list)

  • +
+
+
Returns:
+

the Thai text

+
+
Return type:
+

Union[str,List[str]]

+
+
Example:
+

+
+
from pythainlp.tokenize import word_detokenize
+
+print(word_detokenize(["เรา", "เล่น"]))
+# output: เราเล่น
+
+
+

Reverses the tokenization process, reconstructing text from tokenized units. Useful for text generation tasks.

+
+ +
+
+class pythainlp.tokenize.Tokenizer(custom_dict: Trie | Iterable[str] | str = [], engine: str = 'newmm', keep_whitespace: bool = True, join_broken_num: bool = True)[source]
+

Tokenizer class for a custom tokenizer.

+

This class allows users to pre-define a custom dictionary along with a tokenizer and encapsulate them into a single object. It is a wrapper around both pythainlp.tokenize.word_tokenize() and pythainlp.util.dict_trie().

+
+
Example:
+

+
+

Tokenizer object instantiated with pythainlp.util.Trie:

+
from pythainlp.tokenize import Tokenizer
+from pythainlp.corpus.common import thai_words
+from pythainlp.util import dict_trie
+
+custom_words_list = set(thai_words())
+custom_words_list.add('อะเฟเซีย')
+custom_words_list.add('Aphasia')
+trie = dict_trie(dict_source=custom_words_list)
+
+text = "อะเฟเซีย (Aphasia*) เป็นอาการผิดปกติของการพูด"
+_tokenizer = Tokenizer(custom_dict=trie, engine='newmm')
+_tokenizer.word_tokenize(text)
+# output: ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ',
+'ผิดปกติ', 'ของ', 'การ', 'พูด']
+
+
+

Tokenizer object instantiated with a list of words:

+
text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด"
+_tokenizer = Tokenizer(custom_dict=list(thai_words()), engine='newmm')
+_tokenizer.word_tokenize(text)
+# output:
+# ['อะ', 'เฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ',
+#   'ผิดปกติ', 'ของ', 'การ', 'พูด']
+
+
+

Tokenizer object instantiated with a file path containing a list of +words separated with newline and explicitly setting a new tokenizer +after initiation:

+
PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txt'
+
+# write a file
+with open(PATH_TO_CUSTOM_DICTIONARY, 'w', encoding='utf-8') as f:
+    f.write('อะเฟเซีย\nAphasia\nผิด\nปกติ')
+
+text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด"
+
+# initiate an object from file with `attacut` as tokenizer
+_tokenizer = Tokenizer(custom_dict=PATH_TO_CUSTOM_DICTIONARY, \
+    engine='attacut')
+
+_tokenizer.word_tokenize(text)
+# output:
+# ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ', 'ผิด',
+#   'ปกติ', 'ของ', 'การ', 'พูด']
+
+# change tokenizer to `newmm`
+_tokenizer.set_tokenize_engine(engine='newmm')
+_tokenizer.word_tokenize(text)
+# output:
+# ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็นอาการ', 'ผิด',
+#   'ปกติ', 'ของการพูด']
+
+
+

The Tokenizer class is a versatile tool for customizing tokenization processes and managing tokenization models. It provides various methods and attributes to fine-tune tokenization according to your specific needs.

+
+
+__init__(custom_dict: Trie | Iterable[str] | str = [], engine: str = 'newmm', keep_whitespace: bool = True, join_broken_num: bool = True)[source]
+

Initialize tokenizer object.

+
+
Parameters:
+
    +
  • custom_dict (str) – a file path, a list of vocabularies to be used to create a trie, or an instantiated pythainlp.util.Trie object.

  • +
  • engine (str) – choose between different options of tokenizer engines +(i.e. newmm, mm, longest, deepcut)

  • +
  • keep_whitespace (bool) – True to keep whitespace, a common mark +for end of phrase in Thai

  • +
+
+
+
+ +
+
+word_tokenize(text: str) List[str][source]
+

Main tokenization function.

+
+
Parameters:
+

text (str) – text to be tokenized

+
+
Returns:
+

list of words, tokenized from the text

+
+
Return type:
+

list[str]

+
+
+
+ +
+
+set_tokenize_engine(engine: str) None[source]
+

Set the tokenizer’s engine.

+
+
Parameters:
+

engine (str) – choose between different options of tokenizer engines +(i.e. newmm, mm, longest, deepcut)

+
+
+
+ +
+ +
+
+class pythainlp.tokenize.display_cell_tokenize(text: str)[source]
+

Display cell tokenizer.

+

Tokenizes Thai text into display cells without splitting tone marks.

+
+
Parameters:
+

text (str) – text to be tokenized

+
+
Returns:
+

list of display cells

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+

Tokenize Thai text into display cells:

+
from pythainlp.tokenize import display_cell_tokenize
+
+text = "แม่น้ำอยู่ที่ไหน"
+display_cell_tokenize(text)
+# output: ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']
+
+
+
+ +
+
+

Tokenization Engines

+

This module offers multiple tokenization engines designed for different levels of text analysis.

+
+
+

Sentence level

+

crfcut

+

CRFCut - Thai sentence segmenter.

+

Thai sentence segmentation using conditional random field, +with default model trained on TED dataset

+

Performance:

  • ORCHID - space-correct accuracy 87% vs 95% state-of-the-art

  • TED dataset - space-correct accuracy 82%

  • +
+

See development notebooks at https://github.com/vistec-AI/ted_crawler. POS features are not used because no reliable POS tagging is available.

+

A tokenizer that operates at the sentence level using Conditional Random Fields (CRF). It is suitable for segmenting text into sentences accurately.

+
+
+pythainlp.tokenize.crfcut.extract_features(doc: List[str], window: int = 2, max_n_gram: int = 3) List[List[str]][source]
+

Extract features for CRF by sliding max_n_gram of tokens +for +/- window from the current token

+
+
Parameters:
+
    +
  • doc (List[str]) – tokens from which features are to be extracted

  • +
  • window (int) – size of window before and after the current token

  • +
  • max_n_gram (int) – create n_grams from 1-gram to max_n_gram-gram within the window

  • +
+
+
Returns:
+

list of lists of features to be fed to CRF

+
+
+
+ +
+
+pythainlp.tokenize.crfcut.segment(text: str) List[str][source]
+

CRF-based sentence segmentation.

+
+
Parameters:
+

text (str) – text to be tokenized into sentences

+
+
Returns:
+

list of words, tokenized from the text

+
+
+
+ +
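The module-level function can also be called directly; a short sketch (segment boundaries depend on the trained CRF model):

from pythainlp.tokenize import crfcut

crfcut.segment("ฉันไปประชุมเมื่อวันที่ 11 มีนาคม และกลับมาทำงานต่อ")
# returns a list of sentence strings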

thaisumcut

+

The implementation of the sentence segmenter from Nakhun Chumpolsathien, 2020. Original code is from: https://github.com/nakhunchumpolsathien/ThaiSum

+

Cite:

+
+
@mastersthesis{chumpolsathien_2020,
  title={Using Knowledge Distillation from Keyword Extraction to Improve the Informativeness of Neural Cross-lingual Summarization},
  author={Chumpolsathien, Nakhun},
  year={2020},
  school={Beijing Institute of Technology}
}

+
+
+

A sentence tokenizer based on a maximum entropy model. It’s a great choice for sentence boundary detection in Thai text.

+
+
+pythainlp.tokenize.thaisumcut.list_to_string(list: List[str]) str[source]
+
+ +
+
+pythainlp.tokenize.thaisumcut.middle_cut(sentences: List[str]) List[str][source]
+
+ +
+
+class pythainlp.tokenize.thaisumcut.ThaiSentenceSegmentor[source]
+
+
+split_into_sentences(text: str, isMiddleCut: bool = False) List[str][source]
+
+ +
+ +
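A minimal sketch of using the segmenter class directly (isMiddleCut is left at its default of False):

from pythainlp.tokenize.thaisumcut import ThaiSentenceSegmentor

segmenter = ThaiSentenceSegmentor()
segmenter.split_into_sentences("ฉันไปประชุมเมื่อวันที่ 11 มีนาคม และกลับมาทำงานต่อ")
# returns a list of sentence strings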
+
+

Word level

+

attacut

+

Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai

+
+
See Also:
+
+
+
+

A tokenizer designed for word-level segmentation. It provides accurate word boundary detection in Thai text.

+
+
+class pythainlp.tokenize.attacut.AttacutTokenizer(model='attacut-sc')[source]
+
+
+__init__(model='attacut-sc')[source]
+
+ +
+
+tokenize(text: str) List[str][source]
+
+ +
+ +
+
+pythainlp.tokenize.attacut.segment(text: str, model: str = 'attacut-sc') List[str][source]
+

Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai.

Parameters: text (str) – text to be tokenized into words; model (str) – word tokenizer model.

Returns: list of words, tokenized from the text. Return type: list[str].

Options for model

+
+
    +
  • attacut-sc (default) using both syllable and character features

  • +
  • attacut-c using only character feature

  • +
+
+
+ +
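A short usage sketch (requires the attacut package; the output shown follows the word_tokenize example above):

from pythainlp.tokenize import attacut

attacut.segment("โอเคบ่พวกเรารักภาษาบ้านเกิด")                      # attacut-sc (default)
# output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']

attacut.segment("โอเคบ่พวกเรารักภาษาบ้านเกิด", model="attacut-c")   # character features only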

deepcut

+

multi_cut

+

Multi cut – Thai word segmentation with maximum matching. +Original codes from Korakot Chaovavanich.

+
+
See Also:
+
+
+
+

A dictionary-based maximum-matching ("multi-cut") tokenizer that keeps track of multiple possible segmentations of the same text.

+
+
+class pythainlp.tokenize.multi_cut.LatticeString(value, multi=None, in_dict=True)[source]
+

String that keeps possible tokenizations

+
+
+__init__(value, multi=None, in_dict=True)[source]
+
+ +
+ +
+
+pythainlp.tokenize.multi_cut.mmcut(text: str) List[str][source]
+
+ +
+
+pythainlp.tokenize.multi_cut.segment(text: str, custom_dict: ~pythainlp.util.trie.Trie = <pythainlp.util.trie.Trie object>) List[str][source]
+

Dictionary-based maximum matching word segmentation.

+
+
Parameters:
+
    +
  • text (str) – text to be tokenized

  • +
  • custom_dict (Trie, optional) – tokenization dictionary, defaults to DEFAULT_WORD_DICT_TRIE

  • +
+
+
Returns:
+

list of segmented tokens

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+pythainlp.tokenize.multi_cut.find_all_segment(text: str, custom_dict: ~pythainlp.util.trie.Trie = <pythainlp.util.trie.Trie object>) List[str][source]
+

Get all possible segment variations.

+
+
Parameters:
+
    +
  • text (str) – input string to be tokenized

  • +
  • custom_dict (Trie, optional) – tokenization dictionary, defaults to DEFAULT_WORD_DICT_TRIE

  • +
+
+
Returns:
+

list of segment variations

+
+
Return type:
+

List[str]

+
+
+
+ +
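A minimal sketch showing one maximum-matching segmentation and all segmentation variants (results depend on the dictionary in use):

from pythainlp.tokenize import multi_cut

text = "ผมรักคุณ"
multi_cut.segment(text)           # one maximum-matching segmentation
multi_cut.find_all_segment(text)  # every possible segmentation of the text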

nlpo3

+

A word tokenizer based on the NLPO3 model. It offers advanced word boundary detection and is suitable for various NLP tasks.

+
+
+pythainlp.tokenize.nlpo3.load_dict(file_path: str, dict_name: str) bool[source]
+

Load a dictionary file into an in-memory dictionary collection.

+

The loaded dictionary will be accessible through the assigned dict_name. This function will not override an existing dictionary name.

+
+
Parameters:
+
    +
  • file_path (str) – Path to a dictionary file

  • +
  • dict_name (str) – A unique dictionary name, used for reference.

  • +
+
+
Return success:
+

True if loaded successfully, False otherwise.

+
+
Return type:
+

bool

+
+
See Also:
+
+
+
+
+ +
+
+pythainlp.tokenize.nlpo3.segment(text: str, custom_dict: str = '_73bcj049dzbu9t49b4va170k', safe_mode: bool = False, parallel_mode: bool = False) List[str][source]
+

Break text into tokens.

+

Python binding for nlpO3, an adaptation of the newmm engine in Rust.

+
+
Parameters:
+
    +
  • text (str) – text to be tokenized

  • +
  • custom_dict (str) – dictionary name, as assigned with load_dict(), defaults to pythainlp/corpus/common/words_th.txt

  • +
  • safe_mode (bool) – reduce chance for long processing time for long text with many ambiguous breaking points, defaults to False

  • +
  • parallel_mode (bool) – Use multithread mode, defaults to False

  • +
+
+
Returns:
+

list of tokens

+
+
Return type:
+

List[str]

+
+
See Also:
+
+
+
+
+ +
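A hedged sketch of registering a custom dictionary and segmenting with it (requires the nlpo3 package; the file path and dictionary name below are hypothetical):

from pythainlp.tokenize import nlpo3

nlpo3.load_dict("/path/to/my_words.txt", "my_dict")   # hypothetical dictionary file
nlpo3.segment("สวัสดีครับ", custom_dict="my_dict", safe_mode=True)
# returns a list of tokens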

longest

+

Dictionary-based longest-matching Thai word segmentation. The implementation is based on code by Patorn Utenpattanun.

+
+
See Also:
+
+
+
+

A tokenizer that identifies word boundaries by selecting the longest possible words in a text.

+
+
+class pythainlp.tokenize.longest.LongestMatchTokenizer(trie: Trie)[source]
+
+
+__init__(trie: Trie)[source]
+
+ +
+
+tokenize(text: str) List[str][source]
+
+ +
+ +
+
+pythainlp.tokenize.longest.segment(text: str, custom_dict: ~pythainlp.util.trie.Trie = <pythainlp.util.trie.Trie object>) List[str][source]
+

Dictionary-based longest matching word segmentation.

+
+
Parameters:
+
    +
  • text (str) – text to be tokenized into words

  • +
  • custom_dict (pythainlp.util.Trie) – dictionary for tokenization

  • +
+
+
Returns:
+

list of words, tokenized from the text

+
+
+
+ +
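A minimal sketch (uses the default word dictionary unless a custom Trie is supplied):

from pythainlp.tokenize import longest

longest.segment("ขอบคุณมากครับ")
# returns a list of words chosen by longest matching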

pyicu

+

Wrapper for PyICU word segmentation. This wrapper module uses icu.BreakIterator with Thai as the icu.Locale to locate boundaries between words in the text.

+
+
See Also:
+
+
+
+

An ICU-based word tokenizer offering robust support for Thai text segmentation.

+
+
+pythainlp.tokenize.pyicu.segment(text: str) List[str][source]
+
+
Parameters:
+

text (str) – text to be tokenized into words

+
+
Returns:
+

list of words, tokenized from the text

+
+
+
+ +
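A short sketch (requires the PyICU package to be installed):

from pythainlp.tokenize import pyicu

pyicu.segment("ฉันรักแมว")
# returns a list of words located by icu.BreakIterator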

nercut

+

nercut 0.2

+

Dictionary-based maximal matching word segmentation, constrained by +Thai Character Cluster (TCC) boundaries, and combining tokens that are +parts of the same named entity.

+

Code by Wannaphong Phatthiyaphaibun

+

A tokenizer optimized for Named Entity Recognition (NER) tasks, ensuring accurate tokenization for entity recognition.

+
+
+pythainlp.tokenize.nercut.segment(text: str, taglist: ~typing.Iterable[str] = ['ORGANIZATION', 'PERSON', 'PHONE', 'EMAIL', 'DATE', 'TIME'], tagger=<pythainlp.tag.named_entity.NER object>) List[str][source]
+

Dictionary-based maximal matching word segmentation, constrained by +Thai Character Cluster (TCC) boundaries, and combining tokens that are +parts of the same named-entity.

+
+
Parameters:
+
    +
  • text (str) – text to be tokenized into words

  • +
  • taglist (list) – a list of named entity tags to be used

  • +
  • tagger (class) – NER tagger engine

  • +
+
+
Returns:
+

list of words, tokenized from the text

+
+
+
+ +
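A hedged sketch; the default NER tagger is loaded lazily, so the first call may download a model:

from pythainlp.tokenize import nercut

nercut.segment("นายวรรณพงษ์ทดสอบระบบ")
# tokens belonging to the same named entity (e.g. the person name) are kept together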

sefr_cut

+

Wrapper for SEFR CUT Thai word segmentation. SEFR CUT is a Thai word segmentation model using a stacked ensemble.

+
+
See Also:
+
+
+
+

An advanced word tokenizer for segmenting Thai text, with a focus on precision.

+
+
+pythainlp.tokenize.sefr_cut.segment(text: str, engine: str = 'ws1000') List[str][source]
+
+ +

oskut

+

Wrapper for OSKut (Out-of-domain StacKed cut for Word Segmentation): Handling Cross- and Out-of-Domain Samples in Thai Word Segmentation, a stacked-ensemble framework with DeepCut as the baseline model (ACL 2021 Findings).

+
+
See Also:
+
+
+
+

A tokenizer that uses a pre-trained model for word segmentation. It’s a reliable choice for general-purpose text analysis.

+
+
+pythainlp.tokenize.oskut.segment(text: str, engine: str = 'ws') List[str][source]
+
+ +

newmm (Default)

+

Dictionary-based maximal matching word segmentation, constrained by +Thai Character Cluster (TCC) boundaries with improved rules.

+

The codes are based on the notebooks created by Korakot Chaovavanich, +with heuristic graph size limit added to avoid exponential waiting time.

+
+
See Also:
+
+
+
+

The default word tokenization engine that provides a balance between accuracy and efficiency for most use cases.

+
+
+pythainlp.tokenize.newmm.segment(text: str, custom_dict: ~pythainlp.util.trie.Trie = <pythainlp.util.trie.Trie object>, safe_mode: bool = False) List[str][source]
+

Maximal-matching word segmentation constrained by Thai Character Cluster.

+

A dictionary-based word segmentation using maximal matching algorithm, +constrained by Thai Character Cluster boundaries.

+

A custom dictionary can be supplied.

+
+
Parameters:
+
    +
  • text (str) – text to be tokenized

  • +
  • custom_dict (Trie, optional) – tokenization dictionary, defaults to DEFAULT_WORD_DICT_TRIE

  • +
  • safe_mode (bool, optional) – reduce chance for long processing time for long text with many ambiguous breaking points, defaults to False

  • +
+
+
Returns:
+

list of tokens

+
+
Return type:
+

List[str]

+
+
+
+ +
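Calling the engine module directly mirrors word_tokenize() with engine="newmm"; a short sketch:

from pythainlp.tokenize import newmm

newmm.segment("โอเคบ่พวกเรารักภาษาบ้านเกิด")
# output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']

# safe_mode bounds processing time on long text with many ambiguous breaking points
newmm.segment("โอเคบ่พวกเรารักภาษาบ้านเกิด", safe_mode=True)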
+
+

Subword level

+

tcc

+

The implementation of tokenizer according to Thai Character Clusters (TCCs) +rules proposed by Theeramunkong et al. 2000.

+
+
Credits:
+
+
+

Tokenizes text into Thai Character Clusters (TCCs), a subword level representation.

+
+
+pythainlp.tokenize.tcc.tcc(text: str) str[source]
+

TCC generator which generates Thai Character Clusters

+
+
Parameters:
+

text (str) – text to be tokenized into character clusters

+
+
Returns:
+

subwords (character clusters)

+
+
Return type:
+

Iterator[str]

+
+
+
+ +
+
+pythainlp.tokenize.tcc.tcc_pos(text: str) Set[int][source]
+

TCC positions

+
+
Parameters:
+

text (str) – text to be tokenized into character clusters

+
+
Returns:
+

list of the ending position of subwords

+
+
Return type:
+

set[int]

+
+
+
+ +
+
+pythainlp.tokenize.tcc.segment(text: str) List[str][source]
+

Subword segmentation

+
+
Parameters:
+

text (str) – text to be tokenized into character clusters

+
+
Returns:
+

list of subwords (character clusters), tokenized from the text

+
+
Return type:
+

list[str]

+
+
+
+ +
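A minimal sketch of the three module functions (cluster boundaries follow the TCC rules described above):

from pythainlp.tokenize import tcc

tcc.segment("ประเทศไทย")              # list of character clusters
tcc.tcc_pos("ประเทศไทย")              # set of cluster-ending positions
for cluster in tcc.tcc("ประเทศไทย"):  # generator over clusters
    print(cluster)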

tcc+

+

The implementation of tokenizer according to Thai Character Clusters (TCCs) +rules proposed by Theeramunkong et al. 2000. +and improved rules that are used in newmm

+
+
Credits:
+
+
+

A subword tokenizer that includes additional rules for more precise subword segmentation.

+
+
+pythainlp.tokenize.tcc_p.tcc(text: str) str[source]
+

TCC generator which generates Thai Character Clusters

+
+
Parameters:
+

text (str) – text to be tokenized into character clusters

+
+
Returns:
+

subwords (character clusters)

+
+
Return type:
+

Iterator[str]

+
+
+
+ +
+
+pythainlp.tokenize.tcc_p.tcc_pos(text: str) Set[int][source]
+

TCC positions

+
+
Parameters:
+

text (str) – text to be tokenized into character clusters

+
+
Returns:
+

list of the ending position of subwords

+
+
Return type:
+

set[int]

+
+
+
+ +
+
+pythainlp.tokenize.tcc_p.segment(text: str) List[str][source]
+

Subword segmentation

+
+
Parameters:
+

text (str) – text to be tokenized into character clusters

+
+
Returns:
+

list of subwords (character clusters), tokenized from the text

+
+
Return type:
+

list[str]

+
+
+
+ +

etcc

+

Segmenting text into Enhanced Thai Character Clusters (ETCCs). Python implementation by Wannaphong Phatthiyaphaibun.

+

This implementation relies on a dictionary of ETCC created from etcc.txt +in pythainlp/corpus.

+

Notebook: +https://colab.research.google.com/drive/1UTQgxxMRxOr9Jp1B1jcq1frBNvorhtBQ

+
+
See Also:
+

+
+

Jeeragone Inrut, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and +Para Limmaneepraserth. “Thai word segmentation using combination of forward +and backward longest matching techniques.” In International Symposium on +Communications and Information Technology (ISCIT), pp. 37-40. 2001.

+

Enhanced Thai Character Clusters (eTCC) tokenizer for subword-level analysis.

+
+
+pythainlp.tokenize.etcc.segment(text: str) List[str][source]
+

Segmenting text into ETCCs.

+

Enhanced Thai Character Cluster (ETCC) is a kind of subword unit. +The concept was presented in Inrut, Jeeragone, Patiroop Yuanghirun, +Sarayut Paludkong, Supot Nitsuwat, and Para Limmaneepraserth. +“Thai word segmentation using combination of forward and backward +longest matching techniques.” In International Symposium on Communications +and Information Technology (ISCIT), pp. 37-40. 2001.

+
+
Parameters:
+

text (str) – text to be tokenized into character clusters

+
+
Returns:
+

list of clusters, tokenized from the text

+
+
Return type:
+

List[str]

+
+
+
+ +
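A short sketch (relies on the ETCC list shipped in pythainlp/corpus; the output follows the subword_tokenize example above):

from pythainlp.tokenize import etcc

etcc.segment("ความแปลกแยกและพัฒนาการ")
# output: ['ความแปลกแยกและ', 'พัฒ', 'นาการ']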

han_solo

+

🪿 Han-solo: Thai syllable segmenter

+

GitHub: https://github.com/PyThaiNLP/Han-solo

+

A CRF-based Thai syllable segmenter that also performs well on text from the Thai social media domain.

+
+
+class pythainlp.tokenize.han_solo.Featurizer(N=2, sequence_size=1, delimiter=None)[source]
+
+
+__init__(N=2, sequence_size=1, delimiter=None)[source]
+
+ +
+
+pad(sentence, padder='#')[source]
+
+ +
+
+featurize(sentence, padding=True, indiv_char=True, return_type='list')[source]
+
+ +
+ +
+
+pythainlp.tokenize.han_solo.segment(text: str) List[str][source]
+
+ +
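A hedged sketch of the module-level segmenter, equivalent to syllable_tokenize() with engine="han_solo":

from pythainlp.tokenize import han_solo

han_solo.segment("ฉันรักแมว")
# returns a list of Thai syllables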
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/tools.html b/5.1/api/tools.html new file mode 100644 index 0000000..a971fcc --- /dev/null +++ b/5.1/api/tools.html @@ -0,0 +1,261 @@ + + + + + + + + + pythainlp.tools — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.tools

+

The pythainlp.tools module encompasses a collection of miscellaneous functions primarily designed for internal use within the PyThaiNLP library. While these functions may not be directly exposed for external use, understanding their purpose can offer insights into the inner workings of PyThaiNLP.

+
+

Modules

+
+
+pythainlp.tools.get_full_data_path(path: str) str[source]
+

This function joins the PyThaiNLP data directory path with the given path and returns the full path.

+
+
Returns:
+

full path given the name of dataset

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.tools import get_full_data_path
+
+get_full_data_path('ttc_freq.txt')
+# output: '/root/pythainlp-data/ttc_freq.txt'
+
+
+

Retrieves the full path to the PyThaiNLP data directory. This function is essential for internal data management, enabling PyThaiNLP to locate resources efficiently.

+
+ +
+
+pythainlp.tools.get_pythainlp_data_path() str[source]
+

Returns the full path where PyThaiNLP keeps its (downloaded) data. +If the directory does not yet exist, it will be created. +The path can be specified through the environment variable +PYTHAINLP_DATA_DIR. By default, ~/pythainlp-data +will be used.

+
+
Returns:
+

full path of directory for pythainlp downloaded data

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.tools import get_pythainlp_data_path
+
+get_pythainlp_data_path()
+# output: '/root/pythainlp-data'
+
+
+

Obtains the path to the PyThaiNLP data directory. This function is useful for accessing the library’s data resources for internal processes.

+
+ +
+
+pythainlp.tools.get_pythainlp_path() str[source]
+

This function returns the full path of the PyThaiNLP code directory.

+
+
Returns:
+

full path of pythainlp codes

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.tools import get_pythainlp_path
+
+get_pythainlp_path()
+# output: '/usr/local/lib/python3.6/dist-packages/pythainlp'
+
+
+

Returns the path to the PyThaiNLP library directory. This function is vital for PyThaiNLP’s internal operations and library management.

+
+ +
+
+pythainlp.tools.misspell.misspell(sentence: str, ratio: float = 0.05)[source]
+

Simulate some misspellings of the input sentence. +The number of misspelled locations is governed by ratio.

+
+
Params str sentence:
+

sentence to be misspelled

+
+
Params float ratio:
+

fraction of characters to be misspelled. Defaults to 0.05.

+
+
Returns:
+

sentence containing some misspelled words

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.tools.misspell import misspell
+
+sentence = "ภาษาไทยปรากฏครั้งแรกในพุทธศักราช 1826"
+
+misspell(sentence, ratio=0.1)
+# output:
+ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727
+
+
+

This module simulates misspellings in Thai text. It is useful for data augmentation and for testing the robustness of text preprocessing and language processing pipelines against noisy input.

+
+ +

The pythainlp.tools module contains these functions, which are mainly intended for PyThaiNLP’s internal workings. While they may not be directly utilized by external users, they play a pivotal role in ensuring the smooth operation of the library. Understanding the purpose of these functions can be valuable for contributors and developers working on PyThaiNLP, as it sheds light on the internal mechanisms and data management within the library.

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/translate.html b/5.1/api/translate.html new file mode 100644 index 0000000..36e607d --- /dev/null +++ b/5.1/api/translate.html @@ -0,0 +1,492 @@ + + + + + + + + + pythainlp.translate — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.translate

+

The pythainlp.translate module is dedicated to machine translation capabilities for the PyThaiNLP library. It provides tools for translating text between different languages, making it a valuable resource for natural language processing tasks.

+
+

Modules

+
+
+class pythainlp.translate.Translate(src_lang: str, target_lang: str, engine: str = 'default', use_gpu: bool = False)[source]
+

Machine Translation

+

The Translate class is the central component of the module, offering a unified interface for various translation tasks. It acts as a coordinator, directing translation requests to specific language pairs and models.

+
+
+__init__(src_lang: str, target_lang: str, engine: str = 'default', use_gpu: bool = False) None[source]
+
+
Parameters:
+
    +
  • src_lang (str) – source language

  • +
  • target_lang (str) – target language

  • +
  • engine (str) – machine translation engine

  • +
  • use_gpu (bool) – load model using GPU (Default is False)

  • +
+
+
+
+
Options for engine
    +
  • default - The default engine for each language.

  • +
  • small100 - A multilingual machine translation model (covering 100 languages)

  • +
+
+
Options for source & target language
    +
  • th - en - Thai to English

  • +
  • en - th - English to Thai

  • +
  • th - zh - Thai to Chinese

  • +
  • zh - th - Chinese to Thai

  • +
  • th - fr - Thai to French

  • +
  • th - xx - Thai to xx (xx is language code). It uses small100 model.

  • +
  • xx - th - xx to Thai (xx is language code). It uses small100 model.

  • +
+
+
+
+
Example:
+

+
+

Translate text from Thai to English:

+
from pythainlp.translate import Translate
+
+th2en = Translate("th", "en")
+
+th2en.translate("ฉันรักแมว")
+# output: I love cat.
+
+
+
+ +
+
+load_model()[source]
+
+ +
+
+translate(text) str[source]
+

Translate text

+
+
Parameters:
+

text (str) – input text in source language

+
+
Returns:
+

translated text in target language

+
+
Return type:
+

str

+
+
+
+ +
+ +
+
+pythainlp.translate.en_th.download_model_all() None[source]
+

Download all translation models in advance

+

This function facilitates the download of all available English to Thai translation models. It ensures that the required models are accessible for translation tasks, enhancing the usability of the module.

+
+ +
+
+class pythainlp.translate.en_th.EnThTranslator(use_gpu: bool = False)[source]
+

English-Thai Machine Translation

+

from VISTEC-depa Thailand Artificial Intelligence Research Institute

+

Website: https://airesearch.in.th/releases/machine-translation-models/

+

Parameters: use_gpu (bool) – load model using GPU (default is False)

+

The EnThTranslator class specializes in translating text from English to Thai. It offers a range of methods for translating sentences and text, enabling accurate and meaningful translations between these languages.

+
+
+__init__(use_gpu: bool = False)[source]
+
+ +
+
+translate(text: str) str[source]
+

Translate text from English to Thai

+
+
Parameters:
+

text (str) – input text in source language

+
+
Returns:
+

translated text in target language

+
+
Return type:
+

str

+
+
Example:
+

+
+

Translate text from English to Thai:

+
from pythainlp.translate import EnThTranslator
+
+enth = EnThTranslator()
+
+enth.translate("I love cat.")
+# output: ฉันรักแมว
+
+
+
+ +
+ +
+
+class pythainlp.translate.en_th.ThEnTranslator(use_gpu: bool = False)[source]
+

Thai-English Machine Translation

+

from VISTEC-depa Thailand Artificial Intelligence Research Institute

+

Website: https://airesearch.in.th/releases/machine-translation-models/

+

Parameters: use_gpu (bool) – load model using GPU (default is False)

+

Conversely, the ThEnTranslator class focuses on translating text from Thai to English. It provides functionality for translating Thai text into English, contributing to effective language understanding and communication.

+
+
+__init__(use_gpu: bool = False)[source]
+
+ +
+
+translate(text: str) str[source]
+

Translate text from Thai to English

+
+
Parameters:
+

text (str) – input text in source language

+
+
Returns:
+

translated text in target language

+
+
Return type:
+

str

+
+
Example:
+

+
+

Translate text from Thai to English:

+
from pythainlp.translate import ThEnTranslator
+
+then = ThEnTranslator()
+
+then.translate("ฉันรักแมว")
+# output: I love cat.
+
+
+
+ +
+ +
+
+class pythainlp.translate.zh_th.ThZhTranslator(use_gpu: bool = False, pretrained: str = 'Lalita/marianmt-th-zh_cn')[source]
+

Thai-Chinese Machine Translation

+

from Lalita @ AI builder

+ +

Parameters: use_gpu (bool) – load model using GPU (default is False)

+

The ThZhTranslator class specializes in translating text from Thai to Chinese (Simplified). This class is valuable for bridging language gaps between these two languages, promoting cross-cultural communication.

+
+
+__init__(use_gpu: bool = False, pretrained: str = 'Lalita/marianmt-th-zh_cn') None[source]
+
+ +
+
+translate(text: str) str[source]
+

Translate text from Thai to Chinese

+
+
Parameters:
+

text (str) – input text in source language

+
+
Returns:
+

translated text in target language

+
+
Return type:
+

str

+
+
Example:
+

+
+

Translate text from Thai to Chinese:

+
from pythainlp.translate import ThZhTranslator
+
+thzh = ThZhTranslator()
+
+thzh.translate("ผมรักคุณ")
+# output: 我爱你
+
+
+
+ +
+ +
+
+class pythainlp.translate.zh_th.ZhThTranslator(use_gpu: bool = False, pretrained: str = 'Lalita/marianmt-zh_cn-th')[source]
+

Chinese-Thai Machine Translation

+

from Lalita @ AI builder

+ +

Parameters: use_gpu (bool) – load model using GPU (default is False)

+

The ZhThTranslator class is designed for translating text from Chinese (Simplified) to Thai. It assists in making content accessible to Thai-speaking audiences by converting Chinese text into Thai.

+
+
+__init__(use_gpu: bool = False, pretrained: str = 'Lalita/marianmt-zh_cn-th') None[source]
+
+ +
+
+translate(text: str) str[source]
+

Translate text from Chinese to Thai

+
+
Parameters:
+

text (str) – input text in source language

+
+
Returns:
+

translated text in target language

+
+
Return type:
+

str

+
+
Example:
+

+
+

Translate text from Chinese to Thai:

+
from pythainlp.translate import ZhThTranslator
+
+zhth = ZhThTranslator()
+
+zhth.translate("我爱你")
+# output: ผมรักคุณนะ
+
+
+
+ +
+ +
+
+class pythainlp.translate.th_fr.ThFrTranslator(use_gpu: bool = False, pretrained: str = 'Helsinki-NLP/opus-mt-th-fr')[source]
+

Thai-French Machine Translation

+

Trained by OPUS Corpus

+

Model is from Language Technology Research Group at the University of Helsinki

+

BLEU 20.4

+ +

Parameters: use_gpu (bool) – load model using GPU (default is False)

+

Lastly, the ThFrTranslator class specializes in translating text from Thai to French. It serves as a tool for expanding language accessibility and promoting content sharing in French-speaking communities.

+
+
+__init__(use_gpu: bool = False, pretrained: str = 'Helsinki-NLP/opus-mt-th-fr') None[source]
+
+ +
+
+translate(text: str) str[source]
+

Translate text from Thai to French

+
+
Parameters:
+

text (str) – input text in source language

+
+
Returns:
+

translated text in target language

+
+
Return type:
+

str

+
+
Example:
+

+
+

Translate text from Thai to French:

+
from pythainlp.translate.th_fr import ThFrTranslator
+
+thfr = ThFrTranslator()
+
+thfr.translate("ทดสอบระบบ")
+# output: "Test du système."
+
+
+
+ +
+ +
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/transliterate.html b/5.1/api/transliterate.html new file mode 100644 index 0000000..877f5cf --- /dev/null +++ b/5.1/api/transliterate.html @@ -0,0 +1,559 @@ + + + + + + + + + pythainlp.transliterate — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.transliterate

+

The pythainlp.transliterate module is dedicated to the transliteration of Thai text into romanized form, effectively spelling it out with the English alphabet. This functionality is invaluable for making Thai text more accessible to non-Thai speakers and for various language processing tasks.

+
+

Modules

+
+
+pythainlp.transliterate.romanize(text: str, engine: str = 'royin', fallback_engine: str = 'royin') str[source]
+

This function renders Thai words in the Latin alphabet or “romanization”, +using the Royal Thai General System of Transcription (RTGS) +[1]. RTGS is the official system published +by the Royal Institute of Thailand. (Thai: ถอดเสียงภาษาไทยเป็นอักษรละติน)

+
+
Parameters:
+
    +
  • text (str) – Thai text to be romanized

  • +
  • engine (str) – One of ‘royin’ (default), ‘thai2rom’, ‘thai2rom_onnx, ‘tltk’, and ‘lookup’. See more in options for engine section.

  • +
  • fallback_engine (str) – If engine equals ‘lookup’, use fallback_engine for words that are not in the transliteration dict. +No effect on other engines. Default to ‘royin’.

  • +
+
+
Returns:
+

A string of Thai words rendered in the Latin alphabet.

+
+
Return type:
+

str

+
+
Options for engines:
+
    +
  • royin - (default) based on the Royal Thai General System of +Transcription issued by Royal Institute of Thailand.

  • +
  • thai2rom - a deep learning-based Thai romanization engine +(require PyTorch).

  • +
  • thai2rom_onnx - a deep learning-based Thai romanization engine with ONNX runtime

  • +
  • tltk - TLTK: Thai Language Toolkit

  • +
  • lookup - Look up on Thai-English Transliteration dictionary v1.4 compiled by Wannaphong.

  • +
+
+
Example:
+

+
+
from pythainlp.transliterate import romanize
+
+romanize("สามารถ", engine="royin")
+# output: 'samant'
+
+romanize("สามารถ", engine="thai2rom")
+# output: 'samat'
+
+romanize("สามารถ", engine="tltk")
+# output: 'samat'
+
+romanize("ภาพยนตร์", engine="royin")
+# output: 'phapn'
+
+romanize("ภาพยนตร์", engine="thai2rom")
+# output: 'phapphayon'
+
+romanize("ภาพยนตร์", engine="thai2rom_onnx")
+# output: 'phapphayon'
+
+romanize("ก็อปปี้", engine="lookup")
+# output: 'copy'
+
+
+

The romanize function allows you to transliterate Thai text, converting it into a phonetic representation using the English alphabet. It’s a fundamental tool for rendering Thai words and phrases in a more familiar format.

+
+ +
+
+pythainlp.transliterate.transliterate(text: str, engine: str = 'thaig2p') str[source]
+

This function transliterates Thai text.

+
+
Parameters:
+
    +
  • text (str) – Thai text to be transliterated

  • +
  • engine (str) – ‘icu’, ‘ipa’, or ‘thaig2p’ (default)

  • +
+
+
Returns:
+

A string of phonetic alphabets indicating +how the input text should be pronounced.

+
+
Return type:
+

str

+
+
Options for engines:
+
    +
  • thaig2p - (default) Thai Grapheme-to-Phoneme, +output is IPA (require PyTorch)

  • +
  • icu - pyicu, based on International Components for Unicode (ICU)

  • +
  • ipa - epitran, output is International Phonetic Alphabet (IPA)

  • +
  • tltk_g2p - Thai Grapheme-to-Phoneme from TLTK.,

  • +
  • iso_11940 - Thai text into Latin characters with ISO 11940.

  • +
  • tltk_ipa - tltk, output is International Phonetic Alphabet (IPA)

  • +
  • thaig2p_v2 - Thai Grapheme-to-Phoneme, +output is IPA. https://huggingface.co/pythainlp/thaig2p-v2.0

  • +
+
+
Example:
+

+
+
from pythainlp.transliterate import transliterate
+
+transliterate("สามารถ", engine="icu")
+# output: 's̄āmārt̄h'
+
+transliterate("สามารถ", engine="ipa")
+# output: 'saːmaːrot'
+
+transliterate("สามารถ", engine="thaig2p")
+# output: 's aː ˩˩˦ . m aː t̚ ˥˩'
+
+transliterate("สามารถ", engine="tltk_ipa")
+# output: 'saː5.maːt3'
+
+transliterate("สามารถ", engine="tltk_g2p")
+# output: 'saa4~maat2'
+
+transliterate("สามารถ", engine="iso_11940")
+# output: 's̄āmārt̄h'
+
+transliterate("ภาพยนตร์", engine="icu")
+# output: 'p̣hāphyntr̒'
+
+transliterate("ภาพยนตร์", engine="ipa")
+# output: 'pʰaːpjanot'
+
+transliterate("ภาพยนตร์", engine="thaig2p")
+# output: 'pʰ aː p̚ ˥˩ . pʰ a ˦˥ . j o n ˧'
+
+transliterate("ภาพยนตร์", engine="iso_11940")
+# output: 'p̣hāphyntr'
+
+
+

The transliterate function serves as a versatile transliteration tool, offering a range of transliteration engines to choose from. It provides flexibility and customization for your transliteration needs.

+
+ +
+
+pythainlp.transliterate.pronunciate(word: str, engine: str = 'w2p') str[source]
+

This function pronunciates Thai word.

+
+
Parameters:
+
    +
  • word (str) – Thai text to be pronunciated

  • +
  • engine (str) – ‘w2p’ (default)

  • +
+
+
Returns:
+

A string of Thai letters indicating +how the input text should be pronounced.

+
+
Return type:
+

str

+
+
Options for engines:
+
    +
  • w2p - Thai Word-to-Phoneme

  • +
+
+
Example:
+

+
+
from pythainlp.transliterate import pronunciate
+
+pronunciate("สามารถ", engine="w2p")
+# output: 'สา-มาด'
+
+pronunciate("ภาพยนตร์", engine="w2p")
+# output: 'พาบ-พะ-ยน'
+
+
+

This function provides assistance in generating phonetic representations of Thai words, which is particularly useful for language learning and pronunciation practice.

+
+ +
+
+pythainlp.transliterate.puan(word: str, show_pronunciation: bool = True) str[source]
+

Thai Spoonerism

+

This function converts Thai word to spoonerism word.

+
+
Parameters:
+
    +
  • word (str) – Thai word to be spoonerized

  • +
  • show_pronunciation (bool) – True (default) or False

  • +
+
+
Returns:
+

A string of Thai spoonerism word.

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.transliterate import puan
+
+puan("นาริน")
+# output: 'นิน-รา'
+
+puan("นาริน", False)
+# output: 'นินรา'
+
+
+

The puan function implements Thai spoonerism: it converts a Thai word into its spoonerized form, optionally showing the pronunciation of the result.

+
+ +
+
+class pythainlp.transliterate.wunsen.WunsenTransliterate[source]
+

Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text +to Thai text +by Wunsen

+
+
See Also:
+
+
+
+

The WunsenTransliterate class wraps the “Wunsen” engine, which transliterates romanized Japanese, Korean, Mandarin, and Vietnamese text into Thai.

+
+
+__init__() None[source]
+
+ +
+
+transliterate(text: str, lang: str, jp_input: str | None = None, zh_sandhi: bool | None = None, system: str | None = None)[source]
+

Use Wunsen for transliteration

+
+
Parameters:
+
    +
  • text (str) – text to be transliterated to Thai text.

  • +
  • lang (str) – source language

  • +
  • jp_input (str) – Japanese input method (for Japanese only)

  • +
  • zh_sandhi (bool) – Mandarin third tone sandhi option +(for Mandarin only)

  • +
  • system (str) – transliteration system (for Japanese and +Mandarin only)

  • +
+
+
Returns:
+

Thai text

+
+
Return type:
+

str

+
+
Options for lang:
+
    +
  • jp - Japanese (from Hepburn romanization)

  • +
  • ko - Korean (from Revised Romanization)

  • +
  • vi - Vietnamese (Latin script)

  • +
  • zh - Mandarin (from Hanyu Pinyin)

  • +
+
+
Options for jp_input:
+
    +
  • Hepburn-no diacritic - Hepburn-no diacritic (without macron)

  • +
+
+
Options for zh_sandhi:
+
    +
  • True - apply third tone sandhi rule

  • +
  • False - do not apply third tone sandhi rule

  • +
+
+
Options for system:
+
    +
  • ORS61 - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น (สำนักงานราชบัณฑิตยสภา พ.ศ. 2561)

  • +
  • RI35 - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น (ราชบัณฑิตยสถาน พ.ศ. 2535)

  • +
  • RI49 - for Mandarin หลักเกณฑ์การทับศัพท์ภาษาจีน (ราชบัณฑิตยสถาน พ.ศ. 2549)

  • +
  • THC43 - for Mandarin เกณฑ์การถ่ายทอดเสียงภาษาจีนแมนดารินด้วยอักขรวิธีไทย (คณะกรรมการสืบค้นประวัติศาสตร์ไทยในเอกสารภาษาจีน พ.ศ. 2543)

  • +
+
+
Example:
+

+
+
from pythainlp.transliterate.wunsen import WunsenTransliterate
+
+wt = WunsenTransliterate()
+
+wt.transliterate("ohayō", lang="jp")
+# output: 'โอฮาโย'
+
+wt.transliterate(
+    "ohayou",
+    lang="jp",
+    jp_input="Hepburn-no diacritic"
+)
+# output: 'โอฮาโย'
+
+wt.transliterate("ohayō", lang="jp", system="RI35")
+# output: 'โอะฮะโย'
+
+wt.transliterate("annyeonghaseyo", lang="ko")
+# output: 'อันนย็องฮาเซโย'
+
+wt.transliterate("xin chào", lang="vi")
+# output: 'ซีน จ่าว'
+
+wt.transliterate("ni3 hao3", lang="zh")
+# output: 'หนี เห่า'
+
+wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False)
+# output: 'หนี่ เห่า'
+
+wt.transliterate("ni3 hao3", lang="zh", system="RI49")
+# output: 'หนี ห่าว'
+
+
+
+ +
+ +
+
+

Transliteration Engines

+

thai2rom

+
+
+pythainlp.transliterate.thai2rom.romanize(text: str) str[source]
+

Romanize Thai text

+
+
Parameters:
+

text (str) – Thai text to be romanized

+
+
Returns:
+

Roman characters representing the pronunciation of the Thai text

+
+
Return type:
+

str

+
+
+

The thai2rom engine specializes in transliterating Thai text into romanized form. It’s particularly useful for rendering Thai words accurately in an English phonetic format.

+
+ +

royin

+
+
+pythainlp.transliterate.royin.romanize(text: str) str[source]
+

Render Thai words in Latin alphabet, using RTGS

+

Royal Thai General System of Transcription (RTGS), +is the official system by the Royal Institute of Thailand.

+
+
Parameters:
+

text (str) – Thai text to be romanized

+
+
Returns:
+

A string of Thai words rendered in the Latin alphabet

+
+
Return type:
+

str

+
+
+

The royin engine focuses on transliterating Thai text into English characters. It provides an alternative approach to transliteration, ensuring accurate representation of Thai words.

+
+ +

Transliterate Engines

+

This section includes multiple transliteration engines designed to suit various use cases. They offer unique methods for transliterating Thai text into romanized form:

+
    +
  • icu: Utilizes the ICU transliteration system for phonetic conversion.

  • +
  • ipa: Provides International Phonetic Alphabet (IPA) representation of Thai text.

  • +
  • thaig2p: (default) Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation.

  • +
  • thaig2p_v2: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation. This model is from https://huggingface.co/pythainlp/thaig2p-v2.0

  • +
  • tltk: Utilizes the TLTK transliteration system for a specific approach to transliteration.

  • +
  • iso_11940: Focuses on the ISO 11940 transliteration standard.

  • +
+
+
+

References

+ +

The pythainlp.transliterate module offers a comprehensive set of tools and engines for transliterating Thai text into Romanized form. Whether you need a simple transliteration, specific engines for accurate representation, or phonetic rendering, this module provides a wide range of options. Additionally, the module references a publication that highlights the significance of Romanization, Transliteration, and Transcription in making the Thai language accessible to a global audience.

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/ulmfit.html b/5.1/api/ulmfit.html new file mode 100644 index 0000000..9575ae8 --- /dev/null +++ b/5.1/api/ulmfit.html @@ -0,0 +1,594 @@ + + + + + + + + + pythainlp.ulmfit — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.ulmfit

+

Welcome to the pythainlp.ulmfit module, where you’ll find powerful tools for Universal Language Model Fine-tuning for Text Classification (ULMFiT). ULMFiT is a cutting-edge technique for training deep learning models on large text corpora and then fine-tuning them for specific text classification tasks.

+
+

Modules

+
+
+class pythainlp.ulmfit.ThaiTokenizer(lang: str = 'th')[source]
+

Wrapper around a frozen newmm tokenizer to make it a +fastai.BaseTokenizer. +(see: https://docs.fast.ai/text.transform#BaseTokenizer)

+

The ThaiTokenizer class is a critical component of ULMFiT, designed for tokenizing Thai text effectively. Tokenization is the process of breaking down text into individual tokens, and this class allows you to do so with precision and accuracy.

+
+
+__init__(lang: str = 'th')[source]
+
+ +
+
+static tokenizer(text: str) List[str][source]
+

This function tokenizes text using a frozen newmm engine and the dictionary built specifically for ULMFiT-related functions (see: Dictionary file (.txt)). Parameters: text (str) – text to tokenize. Returns: tokenized text. Return type: list[str]

+
+
Example:
+

Using pythainlp.ulmfit.ThaiTokenizer.tokenizer() is +similar to pythainlp.tokenize.word_tokenize() +using ulmfit engine.

+
>>> from pythainlp.ulmfit import ThaiTokenizer
+>>> from pythainlp.tokenize import word_tokenize
+>>>
+>>> text = "อาภรณ์, จินตมยปัญญา ภาวนามยปัญญา"
+>>> ThaiTokenizer.tokenizer(text)
+ ['อาภรณ์', ',', ' ', 'จิน', 'ตม', 'ย', 'ปัญญา',
+ ' ', 'ภาวนามยปัญญา']
+>>>
+>>> word_tokenize(text, engine='ulmfit')
+['อาภรณ์', ',', ' ', 'จิน', 'ตม', 'ย', 'ปัญญา',
+ ' ', 'ภาวนามยปัญญา']
+
+
+
+
+
+ +
+
+add_special_cases(toks)[source]
+
+ +
+ +
+
+pythainlp.ulmfit.document_vector(text: str, learn, data, agg: str = 'mean')[source]
+

This function vectorizes Thai input text into a 400 dimension vector using +fastai language model and data bunch.

+
+
Meth:
+

document_vector get document vector using fastai language model +and data bunch

+
+
Parameters:
+
    +
  • text (str) – text to be vectorized with fastai language model.

  • +
  • learnfastai language model learner

  • +
  • datafastai data bunch

  • +
  • agg (str) – name of aggregation methods for word embeddings +The available methods are “mean” and “sum”

  • +
+
+
Returns:
+

numpy.array of document vector sized 400 based on +the encoder of the model

+
+
Return type:
+

numpy.ndarray((1, 400))

+
+
Example:
+
>>> from pythainlp.ulmfit import document_vector
+>>> from fastai import *
+>>> from fastai.text import *
+>>>
+>>> # Load Data Bunch
+>>> data = load_data(MODEL_PATH, 'thwiki_lm_data.pkl')
+>>>
+>>> # Initialize language_model_learner
+>>> config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1,
+     qrnn=False, tie_weights=True, out_bias=True, output_p=0.25,
+     hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)
+>>> trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1)
+>>> learn = language_model_learner(data, AWD_LSTM, config=config,
+                                   pretrained=False, **trn_args)
+>>> document_vector('วันนี้วันดีปีใหม่', learn, data)
+
+
+
+
See Also:
+
    +
  • A notebook showing how to train ulmfit language model and its +usage, Jupyter Notebook

  • +
+
+
+

The document_vector function is a powerful tool that computes document vectors for text data. This functionality is often used in text classification tasks where you need to represent documents as numerical vectors for machine learning models.

+
+ +
+
+pythainlp.ulmfit.fix_html(text: str) str[source]
+

Replace HTML strings in text. (codes from fastai)

+
+
Parameters:
+

text (str) – text to replace HTML strings in

+
+
Returns:
+

text with HTML strings replaced

+
+
Return type:
+

str

+
+
Example:
+
>>> from pythainlp.ulmfit import fix_html
+>>> fix_html("Anbsp;amp;nbsp;B @.@ ")
+A & B.
+
+
+
+
+

The fix_html function is a text preprocessing utility that handles HTML-specific characters, making text cleaner and more suitable for text classification.

+
+ +
+
+pythainlp.ulmfit.lowercase_all(toks: Collection[str]) List[str][source]
+

Lowercase all English words; English words in Thai texts do not usually carry meaningful nuances of capitalization.

+

The lowercase_all function is a text processing utility that converts all text to lowercase. This is useful for ensuring uniformity in text data and reducing the complexity of text classification tasks.
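A minimal sketch: only English (Latin) tokens are affected, while Thai tokens are returned unchanged.

from pythainlp.ulmfit import lowercase_all

lowercase_all(["PyThaiNLP", "is", "FUN", "ภาษาไทย"])
# expected output: ['pythainlp', 'is', 'fun', 'ภาษาไทย']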

+
+ +
+
+pythainlp.ulmfit.merge_wgts(em_sz, wgts, itos_pre, itos_new)[source]
+

This function inserts new vocabulary into an existing set of model weights (wgts) and initializes the weights for the new vocabulary with the average embedding.

+
+
Meth:
+

merge_wgts insert pretrained weights and vocab into a new set +of weights and vocab; use average if vocab not in pretrained vocab

+
+
Parameters:
+
    +
  • em_sz (int) – embedding size

  • +
  • wgts – torch model weights

  • +
  • itos_pre (list) – pretrained list of vocab

  • +
  • itos_new (list) – list of new vocab

  • +
+
+
Returns:
+

merged torch model weights

+
+
Example:
+

+
+
from pythainlp.ulmfit import merge_wgts
+import torch
+
+wgts = {'0.encoder.weight': torch.randn(5,3)}
+itos_pre = ["แมว", "คน", "หนู"]
+itos_new = ["ปลา", "เต่า", "นก"]
+em_sz = 3
+
+merge_wgts(em_sz, wgts, itos_pre, itos_new)
+# output:
+# {'0.encoder.weight': tensor([[0.5952, 0.4453, 0.0011],
+# [0.5952, 0.4453, 0.0011],
+# [0.5952, 0.4453, 0.0011]]),
+# '0.encoder_dp.emb.weight': tensor([[0.5952, 0.4453, 0.0011],
+# [0.5952, 0.4453, 0.0011],
+# [0.5952, 0.4453, 0.0011]]),
+# '1.decoder.weight': tensor([[0.5952, 0.4453, 0.0011],
+# [0.5952, 0.4453, 0.0011],
+# [0.5952, 0.4453, 0.0011]])}
+
+
+

The merge_wgts function is a tool for merging weight arrays, which can be crucial for managing and fine-tuning deep learning models in ULMFiT.

+
+ +
+
+pythainlp.ulmfit.process_thai(text: str, pre_rules: ~typing.Collection = [<function fix_html>, <function reorder_vowels>, <function spec_add_spaces>, <function rm_useless_spaces>, <function rm_useless_newlines>, <function rm_brackets>, <function replace_url>, <function replace_rep_nonum>], tok_func: ~typing.Callable = <bound method Tokenizer.word_tokenize of <pythainlp.tokenize.core.Tokenizer object>>, post_rules: ~typing.Collection = [<function ungroup_emoji>, <function lowercase_all>, <function replace_wrep_post_nonum>, <function remove_space>]) Collection[str][source]
+

Process Thai texts for models (with sparse features as default)

+
+
Parameters:
+
    +
  • text (str) – text to be cleaned

  • +
  • pre_rules (list[func]) – rules to apply before tokenization.

  • +
  • tok_func (func) – tokenization function (by default, tok_func is +pythainlp.tokenize.word_tokenize())

  • +
  • post_rules (list[func]) – rules to apply after tokenizations

  • +
+
+
Returns:
+

a list of cleaned tokenized texts

+
+
Return type:
+

list[str]

+
+
Note:
+
    +
  • The default pre-rules consists of fix_html(), +pythainlp.util.normalize(), +spec_add_spaces(), +rm_useless_spaces(), +rm_useless_newlines(), +rm_brackets() +and replace_rep_nonum().

  • +
  • The default post-rules consists of ungroup_emoji(), +lowercase_all(), replace_wrep_post_nonum(), +and remove_space().

  • +
+
+
Example:
+
    +
  1. Use default pre-rules and post-rules:

  2. +
+
>>> from pythainlp.ulmfit import process_thai
+>>> text = "บ้านนนนน () อยู่นานนานนาน 😂🤣😃😄😅 PyThaiNLP amp;     "
+>>> process_thai(text)
+['บ้าน', 'xxrep', '   ', 'อยู่', 'xxwrep', 'นาน', '😂', '🤣',
+'😃', '😄', '😅', 'pythainlp', '&']
+
+
+
    +
  1. Modify pre_rules and post_rules arguments with +rules provided in pythainlp.ulmfit:

  2. +
+
>>> from pythainlp.ulmfit import (
+    process_thai,
+    replace_rep_after,
+    fix_html,
+    ungroup_emoji,
+    replace_wrep_post,
+    remove_space)
+>>>
+>>> text = "บ้านนนนน () อยู่นานนานนาน 😂🤣😃😄😅 PyThaiNLP amp;     "
+>>> process_thai(text,
+                 pre_rules=[replace_rep_after, fix_html],
+                 post_rules=[ungroup_emoji,
+                             replace_wrep_post,
+                             remove_space]
+                )
+['บ้าน', 'xxrep', '5', '()', 'อยู่', 'xxwrep', '2', 'นาน', '😂', '🤣',
+ '😃', '😄', '😅', 'PyThaiNLP', '&']
+
+
+
+
+

The process_thai function is designed for preprocessing Thai text data, a vital step in preparing text for ULMFiT-based text classification.

+
+ +
+
+pythainlp.ulmfit.rm_brackets(text: str) str[source]
+

Remove all empty brackets and artifacts within brackets from text.

+

The rm_brackets function removes brackets from text, making it more suitable for text classification tasks that don’t require bracket information.
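A minimal sketch of rm_brackets; only empty bracket pairs and artifacts inside brackets are removed, so surrounding spaces may remain.

from pythainlp.ulmfit import rm_brackets

rm_brackets("ทดสอบ () ลบวงเล็บ [ ] ว่าง { }")
# empty () [] {} pairs are removed from the text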

+
+ +
+
+pythainlp.ulmfit.rm_useless_newlines(text: str) str[source]
+

Remove multiple newlines in text.

+

The rm_useless_newlines function eliminates unnecessary newlines in text data, ensuring that text is more compact and easier to work with in ULMFiT-based text classification.
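A minimal sketch of rm_useless_newlines; runs of newline characters are collapsed.

from pythainlp.ulmfit import rm_useless_newlines

rm_useless_newlines("สวัสดี\n\n\nชาวโลก")
# consecutive newlines are collapsed into a single separator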

+
+ +
+
+pythainlp.ulmfit.rm_useless_spaces(text: str) str[source]
+

Remove multiple spaces in text. (codes from fastai)

+

The rm_useless_spaces function removes extraneous spaces from text, making it cleaner and more efficient for ULMFiT-based text classification.
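A minimal sketch of rm_useless_spaces:

from pythainlp.ulmfit import rm_useless_spaces

rm_useless_spaces("สวัสดี    ชาวโลก")
# expected output: 'สวัสดี ชาวโลก'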

+
+ +
+
+pythainlp.ulmfit.remove_space(toks: Collection[str]) List[str][source]
+

Filter out space tokens, which are not needed for bag-of-words models.

+
+
Parameters:
+

toks (list[str]) – list of tokens

+
+
Returns:
+

list of tokens where space tokens (” “) are filtered out

+
+
Return type:
+

list[str]

+
+
+

The remove_space function is a utility for removing space characters from text data, streamlining the text for classification purposes.
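A minimal sketch of remove_space:

from pythainlp.ulmfit import remove_space

remove_space(["สวัสดี", " ", "ชาว", " ", "โลก"])
# expected output: ['สวัสดี', 'ชาว', 'โลก']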

+
+ +
+
+pythainlp.ulmfit.replace_rep_after(text: str) str[source]
+

Replace character repetitions in text, putting the repetition token after the repeated character. This is to prevent cases such as 'น้อยยยยยยยย' becoming 'น้อ xxrep 8 ย'; instead the word is retained as 'น้อย xxrep 8'.

+
+
Parameters:
+

text (str) – input text to replace character repetitions in

+
+
Returns:
+

text with repetitive token xxrep and the counter +after the repeated character

+
+
Return type:
+

str

+
+
Example:
+
>>> from pythainlp.ulmfit import replace_rep_after
+>>>
+>>> text = "กาาาาาาา"
+>>> replace_rep_after(text)
+'กาxxrep7 '
+
+
+
+
+

The replace_rep_after function is a text preprocessing tool that collapses character repetitions into a single occurrence followed by an xxrep token and a repetition count. This step helps in standardizing text data for text classification.

+
+ +
+
+pythainlp.ulmfit.replace_rep_nonum(text: str) str[source]
+

Replace character repetitions in text, putting the repetition token after the repeated character. This is done to prevent cases such as 'น้อยยยยยยยย' becoming 'น้อ xxrep ย'; instead the word is retained as 'น้อย xxrep '.

+
+
Parameters:
+

text (str) – input text to replace character repetition

+
+
Returns:
+

text with repetitive token xxrep after +character repetition

+
+
Return type:
+

str

+
+
Example:
+
>>> from pythainlp.ulmfit import replace_rep_nonum
+>>>
+>>> text = "กาาาาาาา"
+>>> replace_rep_nonum(text)
+'กา xxrep '
+
+
+
+
+

The replace_rep_nonum function is similar to replace_rep_after, but it adds only the xxrep token, without appending the repetition count.

+
+ +
+
+pythainlp.ulmfit.replace_wrep_post(toks: Collection[str]) List[str][source]
+

Replace repetitive words after tokenization; +fastai replace_wrep does not work well with Thai.

+
+
Parameters:
+

toks (list[str]) – list of tokens

+
+
Returns:
+

list of tokens where xxwrep token and the counter +is added before repetitive words.

+
+
Return type:
+

list[str]

+
+
Example:
+
>>> from pythainlp.ulmfit import replace_wrep_post_nonum
+>>>
+>>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"]
+>>> replace_wrep_post(toks)
+['กา', 'xxwrep', '3', 'น้ำ']
+
+
+
+
+

The replace_wrep_post function replaces consecutive repeated words with a single occurrence preceded by an xxwrep token and a repetition count. This helps in reducing redundancy in text data, making it more efficient for text classification tasks.

+
+ +
+
+pythainlp.ulmfit.replace_wrep_post_nonum(toks: Collection[str]) List[str][source]
+

Replace repetitive words post tokenization; fastai replace_wrep does not work well with Thai.

+
+
Parameters:
+

toks (list[str]) – list of tokens

+
+
Returns:
+

list of tokens where xxwrep token is added in front of +repetitive words.

+
+
Return type:
+

list[str]

+
+
Example:
+
>>> from pythainlp.ulmfit import replace_wrep_post_nonum
+>>>
+>>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"]
+>>> replace_wrep_post_nonum(toks)
+['กา', 'xxwrep', 'น้ำ']
+
+
+
+
+

Similar to replace_wrep_post, the replace_wrep_post_nonum function collapses repeated words but adds only the xxwrep token, without the repetition count.

+
+ +
+
+pythainlp.ulmfit.spec_add_spaces(text: str) str[source]
+

Add spaces around / and # in text. +(codes from fastai)

+

The spec_add_spaces function is a text processing tool for adding spaces around the special characters / and # in text data. This step helps in standardizing text for ULMFiT-based text classification.
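A minimal sketch of spec_add_spaces:

from pythainlp.ulmfit import spec_add_spaces

spec_add_spaces("ราคา#พิเศษ 10/20")
# expected output (approximate): 'ราคา # พิเศษ 10 / 20'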

+
+ +
+
+pythainlp.ulmfit.ungroup_emoji(toks: Collection[str]) List[str][source]
+

Ungroup Zero Width Joiner (ZWJ) Emojis

+

See https://emojipedia.org/emoji-zwj-sequence/

+

The ungroup_emoji function is designed for ungrouping emojis in text data, which can be crucial for emoji recognition and classification tasks.
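A minimal sketch of ungroup_emoji; a token made up entirely of emojis is split into individual emoji tokens, while other tokens pass through unchanged.

from pythainlp.ulmfit import ungroup_emoji

ungroup_emoji(["วันนี้", "😂🤣"])
# expected output: ['วันนี้', '😂', '🤣']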

+
+ +
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/util.html b/5.1/api/util.html new file mode 100644 index 0000000..4541008 --- /dev/null +++ b/5.1/api/util.html @@ -0,0 +1,2188 @@ + + + + + + + + + pythainlp.util — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.util

+

The pythainlp.util module serves as a treasure trove of utility functions designed to aid text conversion, formatting, and various language processing tasks in the context of Thai language.

+
+

Modules

+
+
+pythainlp.util.abbreviation_to_full_text(text: str, top_k: int = 2) List[Tuple[str, float | None]][source]
+

This function converts Thai text (with abbreviation) to full text.

+

This function uses KhamYo to handle abbreviations. See more: KhamYo.

+
+
Parameters:
+
    +
  • text (str) – Thai text

  • +
  • top_k (int) – Top K

  • +
+
+
Returns:
+

Thai full text with abbreviations converted to full text and cos scores (original text - modified text).

+
+
Return type:
+

List[Tuple[str, Union[float, None]]]

+
+
Example:
+

+
+
from pythainlp.util import abbreviation_to_full_text
+
+text = "รร.ของเราน่าอยู่"
+
+abbreviation_to_full_text(text)
+# output: [
+# ('โรงเรียนของเราน่าอยู่', tensor(0.3734)), 
+# ('โรงแรมของเราน่าอยู่', tensor(0.2438))
+# ]
+
+
+

The abbreviation_to_full_text function is a text processing tool for converting common Thai abbreviations into their full, expanded forms. It’s invaluable for improving text readability and clarity.

+
+ +
+
+pythainlp.util.arabic_digit_to_thai_digit(text: str) str[source]
+

This function converts Arabic digits (i.e. 1, 3, 10) to Thai digits +(i.e. ๑, ๓, ๑๐).

+
+
Parameters:
+

text (str) – Text with Arabic digits such as ‘1’, ‘2’, ‘3’

+
+
Returns:
+

Text with Arabic digits converted to Thai digits +such as ‘๑’, ‘๒’, ‘๓’

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import arabic_digit_to_thai_digit
+
+text = 'เป็นจำนวน 123,400.25 บาท'
+
+arabic_digit_to_thai_digit(text)
+# output: เป็นจำนวน ๑๒๓,๔๐๐.๒๕ บาท
+
+
+

The arabic_digit_to_thai_digit function allows you to transform Arabic numerals into their Thai numeral equivalents. This utility is especially useful when working with Thai numbers in text data.

+
+ +
+
+pythainlp.util.bahttext(number: float) str[source]
+

This function converts a number to Thai text and adds a suffix “บาท” (Baht). The precision is fixed at two decimal places (0.00) to fit the “สตางค์” (Satang) unit. This function works similarly to the BAHTTEXT function in Microsoft Excel.

+
+
Parameters:
+

number (float) – number to be converted into Thai Baht currency format

+
+
Returns:
+

text representing the amount of money in the format +of Thai currency

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import bahttext
+
+bahttext(1)
+# output: หนึ่งบาทถ้วน
+
+bahttext(21)
+# output: ยี่สิบเอ็ดบาทถ้วน
+
+bahttext(200)
+# output: สองร้อยบาทถ้วน
+
+
+

The bahttext function specializes in converting numerical values into Thai Baht text, an essential feature for rendering financial data or monetary amounts in a user-friendly Thai format.

+
+ +
+
+pythainlp.util.convert_years(year: str, src='be', target='ad') str[source]
+

Convert years

+
+
Parameters:
+
    +
  • year (int) – Year

  • +
  • src (str) – The source year

  • +
  • target (str) – The target year

  • +
+
+
Returns:
+

The converted year

+
+
Return type:
+

str

+
+
+
+
Options for year
    +
  • be - Buddhist calendar

  • +
  • ad - Anno Domini

  • +
  • re - Rattanakosin era

  • +
  • ah - Anno Hejira

  • +
+
+
+

Warning: This function works properly only for years after 1941, because Thailand changed its calendar in 1941. If you are a time traveler or a historian, take care to use the correct calendar.

+

The convert_years function converts years between calendar systems: Buddhist Era (BE), Anno Domini (AD), Rattanakosin Era (RE), and Anno Hejira (AH). This is significant for presenting dates and years in a Thai context.
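A minimal sketch of convert_years; per the signature above, the year is passed and returned as a string.

from pythainlp.util import convert_years

convert_years("2566", src="be", target="ad")
# expected output: '2023'

convert_years("2023", src="ad", target="be")
# expected output: '2566'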

+
+ +
+
+pythainlp.util.collate(data: Iterable, reverse: bool = False) List[str][source]
+

This function sorts strings (almost) according to Thai dictionary.

+

Important notes: this implementation ignores tone marks and symbols

+
+
Parameters:
+
    +
  • data (Iterable) – a list of words to be sorted

  • +
  • reverse (bool, optional) – If reverse is set to True the result will be +sorted in descending order. Otherwise, the result +will be sorted in ascending order, defaults to False

  • +
+
+
Returns:
+

a list of strings, sorted alphabetically, (almost) according to +Thai dictionary

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.util import collate
+
+collate(['ไก่', 'เกิด', 'กาล', 'เป็ด', 'หมู', 'วัว', 'วันที่'])
+# output: ['กาล', 'เกิด', 'ไก่', 'เป็ด', 'วันที่', 'วัว', 'หมู']
+
+collate(['ไก่', 'เกิด', 'กาล', 'เป็ด', 'หมู', 'วัว', 'วันที่'], \
+    reverse=True)
+# output: ['หมู', 'วัว', 'วันที่', 'เป็ด', 'ไก่', 'เกิด', 'กาล']
+
+
+

The collate function is a versatile tool for sorting Thai text in a locale-specific manner. It ensures that text data is sorted correctly, taking into account the Thai language’s unique characteristics.

+
+ +
+
+pythainlp.util.count_thai_chars(text: str) dict[source]
+

Count Thai characters by type

+

This function will give you numbers of Thai characters by type (consonants, vowels, lead_vowels, follow_vowels, above_vowels, below_vowels, tonemarks, signs, thai_digits, punctuations, non_thai)

+
+
Parameters:
+

text (str) – Text

+
+
Returns:
+

Dict with numbers of Thai characters by type

+
+
Return type:
+

dict

+
+
Example:
+

+
+
from pythainlp.util import count_thai_chars
+
+count_thai_chars("ทดสอบภาษาไทย")
+# output: {
+# 'vowels': 3,
+# 'lead_vowels': 1,
+# 'follow_vowels': 2,
+# 'above_vowels': 0,
+# 'below_vowels': 0,
+# 'consonants': 9,
+# 'tonemarks': 0,
+# 'signs': 0,
+# 'thai_digits': 0,
+# 'punctuations': 0,
+# 'non_thai': 0
+# }
+
+
+

The count_thai_chars function is a character counting tool specifically tailored for Thai text. It helps in quantifying Thai characters, which can be useful for various text processing tasks.

+
+ +
+
+pythainlp.util.countthai(text: str, ignore_chars: str = ' \t\n\r\x0b\x0c0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~') float[source]
+

Find proportion of Thai characters in a given text

+
+
Parameters:
+
    +
  • text (str) – input text

  • +
  • ignore_chars (str, optional) – characters to be ignored, defaults to whitespace,digits, and punctuation marks.

  • +
+
+
Returns:
+

proportion of Thai characters in the text (percentage)

+
+
Return type:
+

float

+
+
Example:
+

+
+
from pythainlp.util import countthai
+
+countthai("ไทยเอ็นแอลพี 3.0")
+# output: 100.0
+
+countthai("PyThaiNLP 3.0")
+# output: 0.0
+
+countthai("ใช้งาน PyThaiNLP 3.0")
+# output: 40.0
+
+countthai("ใช้งาน PyThaiNLP 3.0", ignore_chars="")
+# output: 30.0
+
+
+

The countthai function is a text processing utility for computing the proportion of Thai characters in text data, expressed as a percentage. This is useful for understanding how much of a text is Thai-language content.

+
+ +
+
+pythainlp.util.dict_trie(dict_source: str | Iterable[str] | Trie) Trie[source]
+

Create a dictionary trie from a file or an iterable.

+
+
Parameters:
+

dict_source (str|Iterable[str]|pythainlp.util.Trie) – a path to +dictionary file or a list of words or a pythainlp.util.Trie object

+
+
Returns:
+

a trie object

+
+
Return type:
+

pythainlp.util.Trie

+
+
+

The dict_trie function implements a Trie data structure for efficient dictionary operations. It’s a valuable resource for dictionary management and fast word lookup.
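A minimal sketch of dict_trie; the custom word list here is made up for illustration, and the resulting trie is typically passed to the tokenizer through its custom_dict parameter.

from pythainlp.util import dict_trie
from pythainlp.tokenize import word_tokenize

custom_words = ["ทดสอบ", "ภาษาไทย"]
trie = dict_trie(dict_source=custom_words)

word_tokenize("ทดสอบภาษาไทย", custom_dict=trie)
# expected output: ['ทดสอบ', 'ภาษาไทย']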

+
+ +
+
+pythainlp.util.digit_to_text(text: str) str[source]
+
+
Parameters:
+

text (str) – Text with digits such as ‘1’, ‘2’, ‘๓’, ‘๔’

+
+
Returns:
+

Text with digits spelled out in Thai

+
+
+

The digit_to_text function is a numeral conversion tool that spells out digit characters (both Arabic and Thai digits) in Thai. This is vital for rendering numbers in Thai text naturally.
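A minimal sketch of digit_to_text; each digit character is spelled out in Thai, and the exact spacing of the output may vary.

from pythainlp.util import digit_to_text

digit_to_text("บ้านเลขที่ 123")
# expected output (approximate): 'บ้านเลขที่ หนึ่งสองสาม'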

+
+ +
+
+pythainlp.util.display_thai_char(ch: str) str[source]
+

Prefix an underscore (_) to a high-position vowel or a tone mark, +to ease readability.

+
+
Parameters:
+

ch (str) – input character

+
+
Returns:
+

“_” + ch

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import display_thai_char
+
+display_thai_char("้")
+# output: "_้"
+
+
+

The display_thai_char function prefixes an underscore to a high-position vowel or tone mark so that the character can be displayed legibly on its own. This is useful when printing individual Thai characters.

+
+ +
+
+pythainlp.util.emoji_to_thai(text: str, delimiters=(':', ':')) str[source]
+

This function converts emojis to their Thai meanings

+
+
Parameters:
+

text (str) – Text with emojis

+
+
Returns:
+

Text with emojis converted to their Thai meanings

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import emoji_to_thai
+
+emoji_to_thai("จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ ใกล้ชิดประชาชนดี 😀")
+# output: จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ
+          ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน:
+
+emoji_to_thai("หิวข้าวอยากกินอาหารญี่ปุ่น 🍣")
+# output: หิวข้าวอยากกินอาหารญี่ปุ่น :ซูชิ:
+
+emoji_to_thai("🇹🇭 นี่คือธงประเทศไทย")
+# output: :ธง_ไทย: นี่คือธงประเทศไทย
+
+
+

The emoji_to_thai function focuses on converting emojis into their Thai language equivalents. This is a unique feature for enhancing text communication with Thai-language emojis.

+
+ +
+
+pythainlp.util.eng_to_thai(text: str) str[source]
+

Corrects the given text that was incorrectly typed using English-US +Qwerty keyboard layout to the originally intended keyboard layout +that is the Thai Kedmanee keyboard.

+
+
Parameters:
+

text (str) – incorrect text input (Thai typed using English keyboard)

+
+
Returns:
+

Thai text with typing using +incorrect keyboard layout is corrected

+
+
Return type:
+

str

+
+
Example:
+

+
+

Intentionally type “ธนาคารแห่งประเทศไทย”, but got “Tok8kicsj’xitgmLwmp”:

+
from pythainlp.util import eng_to_thai
+
+eng_to_thai("Tok8kicsj'xitgmLwmp")
+# output: ธนาคารแห่งประเทศไทย
+
+
+

The eng_to_thai function corrects text that was meant to be Thai but was typed with the English-US QWERTY keyboard layout, mapping each keystroke back to the Thai Kedmanee layout. It is useful for fixing this common typing mistake.

+
+ +
+
+pythainlp.util.find_keyword(word_list: List[str], min_len: int = 3) Dict[str, int][source]
+

This function counts the frequencies of words in the list +where stopword is excluded and returns a frequency dictionary.

+
+
Parameters:
+
    +
  • word_list (list) – a list of words

  • +
  • min_len (int) – the minimum frequency for words to be retained

  • +
+
+
Returns:
+

a dictionary object with key-value pair being words and their raw counts

+
+
Return type:
+

dict[str, int]

+
+
Example:
+

+
+
from pythainlp.util import find_keyword
+
+words = ["บันทึก", "เหตุการณ์", "บันทึก", "เหตุการณ์",
+         " ", "มี", "การ", "บันทึก", "เป็น", " ", "ลายลักษณ์อักษร"
+         "และ", "การ", "บันทึก","เสียง","ใน","เหตุการณ์"]
+
+find_keyword(words)
+# output: {'บันทึก': 4, 'เหตุการณ์': 3}
+
+find_keyword(words, min_len=1)
+# output: {' ': 2, 'บันทึก': 4, 'ลายลักษณ์อักษรและ': 1,
+ 'เสียง': 1, 'เหตุการณ์': 3}
+
+
+

The find_keyword function is a powerful utility for identifying keywords and key phrases in text data. It is a fundamental component for text analysis and information extraction tasks.

+
+ +
+
+pythainlp.util.ipa_to_rtgs(ipa: str) str[source]
+

Convert IPA system to The Royal Thai General System of Transcription (RTGS)

+

Docs: https://en.wikipedia.org/wiki/Help:IPA/Thai

+
+
Parameters:
+

ipa (str) – IPA phoneme

+
+
Returns:
+

The RTGS that is converted, according to rules listed in the Wikipedia page

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import ipa_to_rtgs
+
+print(ipa_to_rtgs("kluaj"))
+# output : 'kluai'
+
+
+

The ipa_to_rtgs function focuses on converting International Phonetic Alphabet (IPA) transcriptions into Royal Thai General System of Transcription (RTGS) format. This is valuable for phonetic analysis and pronunciation guides.

+
+ +
+
+pythainlp.util.isthai(text: str, ignore_chars: str = '.') bool[source]
+

Check if every character in a string is a Thai character.

+
+
Parameters:
+
    +
  • text (str) – input text

  • +
  • ignore_chars (str, optional) – characters to be ignored, defaults to “.”

  • +
+
+
Returns:
+

True if every character in the input string is Thai, +otherwise False.

+
+
Return type:
+

bool

+
+
Example:
+

+
+
from pythainlp.util import isthai
+
+isthai("กาลเวลา")
+# output: True
+
+isthai("กาลเวลา.")
+# output: True
+
+isthai("กาล-เวลา")
+# output: False
+
+isthai("กาล-เวลา +66", ignore_chars="01234567890+-.,")
+# output: True
+
+
+

The isthai function is a straightforward language detection utility that determines whether every character in a string is Thai (characters listed in ignore_chars are skipped). This function is essential for language-specific text processing.

+
+ +
+
+pythainlp.util.isthaichar(ch: str) bool[source]
+

Check if a character is a Thai character.

+
+
Parameters:
+

ch (str) – input character

+
+
Returns:
+

True if ch is a Thai character, otherwise False.

+
+
Return type:
+

bool

+
+
Example:
+

+
+
from pythainlp.util import isthaichar
+
+isthaichar("ก")  # THAI CHARACTER KO KAI
+# output: True
+
+isthaichar("๕")  # THAI DIGIT FIVE
+# output: True
+
+
+

The isthaichar function is designed to check if a character belongs to the Thai script. It helps in character-level language identification and text processing.

+
+ +
+
+pythainlp.util.maiyamok(sent: str | List[str]) List[str][source]
+

Expand Maiyamok.

+

Deprecated. Use expand_maiyamok() instead.

+

Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word +repetition. This function preprocesses Thai text by replacing +Maiyamok with a word being repeated.

+
+
Parameters:
+

sent (Union[str, List[str]]) – sentence (list or string)

+
+
Returns:
+

list of words

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.util import expand_maiyamok
+
+expand_maiyamok("คนๆนก")
+# output: ['คน', 'คน', 'นก']
+
+
+

The maiyamok function expands the Thai repetition mark Maiyamok (ๆ) by repeating the preceding word. It is deprecated; use expand_maiyamok() instead.

+
+ +
+
+pythainlp.util.nectec_to_ipa(pronunciation: str) str[source]
+

Convert NECTEC system to IPA system

+
+
Parameters:
+

pronunciation (str) – NECTEC phoneme

+
+
Returns:
+

IPA that is converted

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import nectec_to_ipa
+
+print(nectec_to_ipa("kl-uua-j^-2"))
+# output : 'kl uua j ˥˩'
+
+
+
+

References

+

Pornpimon Palingoon, Sumonmas Thatphithakkul. Chapter 4 Speech processing and Speech corpus. In: Handbook of Thai Electronic Corpus. 1st ed. p. 122–56.

+

The nectec_to_ipa function focuses on converting text from the NECTEC phonetic transcription system to the International Phonetic Alphabet (IPA). This conversion is vital for linguistic analysis and phonetic representation.

+
+
+ +
+
+pythainlp.util.normalize(text: str) str[source]
+

Normalize and clean Thai text with normalizing rules as follows:

+
+
    +
  • Remove zero-width spaces

  • +
  • Remove duplicate spaces

  • +
  • Reorder tone marks and vowels to standard order/spelling

  • +
  • Remove duplicate vowels and signs

  • +
  • Remove duplicate tone marks

  • +
  • Remove dangling non-base characters at the beginning of text

  • +
+
+

normalize() simply calls remove_zw(), remove_dup_spaces(), remove_repeat_vowels(), and remove_dangling(), in that order.

+

If a user wants to customize the selection or the order of rules +to be applied, they can choose to call those functions by themselves.

+

Note: for Unicode normalization, see unicodedata.normalize().

+
+
Parameters:
+

text (str) – input text

+
+
Returns:
+

normalized text according to the rules

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import normalize
+
+normalize("เเปลก")  # starts with two Sara E
+# output: แปลก
+
+normalize("นานาาา")
+# output: นานา
+
+
+

The normalize function is a text processing utility that standardizes Thai text: it removes zero-width characters, duplicate spaces, duplicate vowels, signs, and tone marks, reorders vowels and tone marks to the standard spelling, and removes dangling non-base characters at the beginning of the text. It is valuable for text normalization and downstream linguistic analysis.

+
+ +
+
+pythainlp.util.now_reign_year() int[source]
+

Return the reign year of the 10th King of Chakri dynasty.

+
+
Returns:
+

reign year of the 10th King of Chakri dynasty.

+
+
Return type:
+

int

+
+
Example:
+

+
+
from pythainlp.util import now_reign_year
+
+text = "เป็นปีที่ {reign_year} ในรัชกาลปัจจุบัน"\
+    .format(reign_year=now_reign_year())
+
+print(text)
+# output: เป็นปีที่ 4 ในรัชกาลปัจจุบัน
+
+
+

The now_reign_year function returns the current reign year of the 10th King of the Chakri dynasty. It is useful for displaying the current year in reign-year terms in a Thai context.

+
+ +
+
+pythainlp.util.num_to_thaiword(number: int) str[source]
+

This function converts number to Thai text

+
+
Parameters:
+

number (int) – an integer number to be converted to Thai text

+
+
Returns:
+

text representing the number in Thai

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import num_to_thaiword
+
+num_to_thaiword(1)
+# output: หนึ่ง
+
+num_to_thaiword(11)
+# output: สิบเอ็ด
+
+
+

The num_to_thaiword function is a numeral conversion tool for translating Arabic numerals into Thai word form. It is crucial for rendering numbers in a natural Thai textual format.

+
+ +
+
+pythainlp.util.rank(words: List[str], exclude_stopwords: bool = False) Counter[source]
+

Count word frequencies given a list of Thai words with an option +to exclude stopwords.

+
+
Parameters:
+
    +
  • words (list) – a list of words

  • +
• exclude_stopwords (bool) – If this parameter is set to True, exclude stopwords from counting. Otherwise, the stopwords will be counted. By default, exclude_stopwords is set to False

  • +
+
+
Returns:
+

a Counter object representing word frequencies in the text

+
+
Return type:
+

collections.Counter

+
+
Example:
+

+
+

Include stopwords when counting word frequencies:

+
from pythainlp.util import rank
+
+words = ["บันทึก", "เหตุการณ์", " ", "มี", "การ", "บันทึก", \
+"เป็น", " ", "ลายลักษณ์อักษร"]
+
+rank(words)
+# output:
+# Counter(
+#     {
+#         ' ': 2,
+#         'การ': 1,
+#         'บันทึก': 2,
+#         'มี': 1,
+#         'ลายลักษณ์อักษร': 1,
+#         'เป็น': 1,
+#         'เหตุการณ์': 1
+#     })
+
+
+

Exclude stopwords when counting word frequencies:

+
from pythainlp.util import rank
+
+words = ["บันทึก", "เหตุการณ์", " ", "มี", "การ", "บันทึก", \
+    "เป็น", " ", "ลายลักษณ์อักษร"]
+
+rank(words, exclude_stopwords=True)
+# output:
+# Counter(
+#     {
+#         ' ': 2,
+#         'บันทึก': 2,
+#         'ลายลักษณ์อักษร': 1,
+#         'เหตุการณ์': 1
+#     })
+
+
+

The rank function counts word frequencies in a list of Thai words and returns a collections.Counter, with an option to exclude stopwords from the count.

+
+ +
+
+pythainlp.util.reign_year_to_ad(reign_year: int, reign: int) int[source]
+

Convert reign year to AD.

+

Return AD year according to the reign year for +the 7th to 10th King of Chakri dynasty, Thailand. +For instance, the AD year of the 4th reign year of the 10th King is 2019.

+
+
Parameters:
+
    +
  • reign_year (int) – reign year of the King

  • +
  • reign (int) – the reign of the King (i.e. 7, 8, 9, and 10)

  • +
+
+
Returns:
+

the year in AD of the King given the reign and reign year.

+
+
Return type:
+

int

+
+
Example:
+

+
+
from pythainlp.util import reign_year_to_ad
+
+print("The 4th reign year of the King Rama X is in", \
+    reign_year_to_ad(4, 10))
+# output: The 4th reign year of the King Rama X is in 2019
+
+print("The 1st reign year of the King Rama IX is in", \
+    reign_year_to_ad(1, 9))
+# output: The 1st reign year of the King Rama IX is in 1946
+
+
+

The reign_year_to_ad function converts a reign year of the 7th to 10th Kings of the Chakri dynasty into the corresponding AD (Anno Domini) year. This is useful for displaying historical dates in a globally recognized format.

+
+ +
+
+pythainlp.util.remove_dangling(text: str) str[source]
+

Remove Thai non-base characters at the beginning of text.

+

This is a common “typo”, especially in form input fields, as these non-base characters can be visually hidden from the user, who may have typed them in accidentally.

+

A character to be removed should be both:

+
+
    +
  • tone mark, above vowel, below vowel, or non-base sign AND

  • +
  • located at the beginning of the text

  • +
+
+
+
Parameters:
+

text (str) – input text

+
+
Returns:
+

text without dangling Thai characters at the beginning

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import remove_dangling
+
+remove_dangling("๊ก")
+# output: 'ก'
+
+
+

The remove_dangling function is a text processing tool for removing dangling characters or diacritics from text. It is useful for text cleaning and normalization.

+
+ +
+
+pythainlp.util.remove_dup_spaces(text: str) str[source]
+

Remove duplicate spaces. Replace multiple spaces with one space.

+

Multiple newline characters and empty lines will be replaced +with one newline character.

+
+
Parameters:
+

text (str) – input text

+
+
Returns:
+

text without duplicated spaces and newlines

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import remove_dup_spaces
+
+remove_dup_spaces("ก    ข    ค")
+# output: 'ก ข ค'
+
+
+

The remove_dup_spaces function focuses on removing duplicate space characters from text data, making it more consistent and readable.

+
+ +
+
+pythainlp.util.remove_repeat_vowels(text: str) str[source]
+

Remove repeating vowels, tone marks, and signs.

+

This function will call reorder_vowels() first, to make sure that +double Sara E will be converted to Sara Ae and not be removed.

+
+
Parameters:
+

text (str) – input text

+
+
Returns:
+

text without repeating Thai vowels, tone marks, and signs

+
+
Return type:
+

str

+
+
+

The remove_repeat_vowels function is designed to eliminate repeated vowel characters in text, improving text readability and consistency.
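A minimal sketch of remove_repeat_vowels:

from pythainlp.util import remove_repeat_vowels

remove_repeat_vowels("นานาาาา")
# expected output: 'นานา'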

+
+ +
+
+pythainlp.util.remove_tone_ipa(ipa: str) str[source]
+

Remove Thai Tones from IPA system

+
+
Parameters:
+

ipa (str) – IPA phoneme

+
+
Returns:
+

IPA phoneme with tones removed

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import remove_tone_ipa
+
+print(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"))
+# output : laː.sa.maj
+
+
+

The remove_tone_ipa function serves as a phonetic conversion tool for removing tone marks from IPA transcriptions. This is crucial for phonetic analysis and linguistic research.

+
+ +
+
+pythainlp.util.remove_tonemark(text: str) str[source]
+

Remove all Thai tone marks from the text.

+

Thai script has four tone marks indicating four tones as follows:

+
+
    +
  • Down tone (Thai: ไม้เอก _่ )

  • +
  • Falling tone (Thai: ไม้โท _้ )

  • +
  • High tone (Thai: ไม้ตรี _๊ )

  • +
  • Rising tone (Thai: ไม้จัตวา _๋ )

  • +
+
+

Putting the wrong tone mark is a common mistake in Thai writing. Removing tone marks from strings can therefore help with approximate string matching.

+
+
Parameters:
+

text (str) – input text

+
+
Returns:
+

text without Thai tone marks

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import remove_tonemark
+
+remove_tonemark("สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด")
+# output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
+
+
+

The remove_tonemark function is a utility for removing tonal marks and diacritics from text data, making it suitable for various text processing tasks.

+
+ +
+
+pythainlp.util.remove_zw(text: str) str[source]
+

Remove zero-width characters.

+

These non-visible characters may cause unexpected results from the user’s point of view. Removing them can make string matching more robust.

+

Characters to be removed:

+
+
    +
  • Zero-width space (ZWSP)

  • +
• Zero-width non-joiner (ZWNJ)

  • +
+
+
+
Parameters:
+

text (str) – input text

+
+
Returns:
+

text without zero-width characters

+
+
Return type:
+

str

+
+
+

The remove_zw function is designed to remove zero-width characters from text data, ensuring that text is free from invisible or unwanted characters.
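A minimal sketch of remove_zw; the example string contains a zero-width space (U+200B) between the two words.

from pythainlp.util import remove_zw

remove_zw("กรุงเทพ\u200bมหานคร")
# expected output: 'กรุงเทพมหานคร'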

+
+ +
+
+pythainlp.util.reorder_vowels(text: str) str[source]
+

Reorder vowels and tone marks to the standard logical order/spelling.

+

Characters in input text will be reordered/transformed, +according to these rules:

+
+
    +
  • Sara E + Sara E -> Sara Ae

  • +
  • Nikhahit + Sara Aa -> Sara Am

  • +
  • tone mark + non-base vowel -> non-base vowel + tone mark

  • +
  • follow vowel + tone mark -> tone mark + follow vowel

  • +
+
+
+
Parameters:
+

text (str) – input text

+
+
Returns:
+

text with vowels and tone marks in the standard logical order

+
+
Return type:
+

str

+
+
+

The reorder_vowels function reorders vowels and tone marks in Thai text into the standard logical order. It is an essential step in text normalization.
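A minimal sketch of reorder_vowels, using the Sara E + Sara E rule listed above:

from pythainlp.util import reorder_vowels

reorder_vowels("เเปลก")  # starts with two Sara E
# expected output: 'แปลก'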

+
+ +
+
+pythainlp.util.rhyme(word: str) List[str][source]
+

Find Thai rhyme

+
+
Parameters:
+

word (str) – A Thai word

+
+
Returns:
+

All list Thai rhyme words

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.util import rhyme
+
+print(rhyme("จีบ"))
+# output: ['กลีบ', 'กีบ', 'ครีบ', ...]
+
+
+

The rhyme function is a utility for finding rhymes of a Thai word.

+
+ +
+
+pythainlp.util.sound_syllable(syllable: str) str[source]
+

Sound syllable classification

+

This function classifies a Thai syllable as either a live syllable or a dead syllable.

+
+
Parameters:
+

syllable (str) – Thai syllable

+
+
Returns:
+

syllable’s type (“live” or “dead”)

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import sound_syllable
+
+print(sound_syllable("มา"))
+# output: live
+
+print(sound_syllable("เลข"))
+# output: dead
+
+
+

The sound_syllable function specializes in identifying and processing Thai characters that represent sound syllables. This is valuable for phonetic and linguistic analysis.

+
+ +
+
+pythainlp.util.syllable_length(syllable: str) str[source]
+

Thai syllable length

+

This function is used for finding syllable’s length. (long or short)

+
+
Parameters:
+

syllable (str) – Thai syllable

+
+
Returns:
+

syllable’s length (long or short)

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import syllable_length
+
+print(syllable_length("มาก"))
+# output: long
+
+print(syllable_length("คะ"))
+# output: short
+
+
+

The syllable_length function is a text analysis tool for calculating the length of syllables in Thai text. It is significant for linguistic analysis and language research.

+
+ +
+
+pythainlp.util.syllable_open_close_detector(syllable: str) str[source]
+

Open/close Thai syllables detector

+

This function is used for finding Thai syllables that are open or closed sound.

+
+
Parameters:
+

syllable (str) – Thai syllable

+
+
Returns:
+

open / close

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import syllable_open_close_detector
+
+print(syllable_open_close_detector("มาก"))
+# output: close
+
+print(syllable_open_close_detector("คะ"))
+# output: open
+
+
+

The syllable_open_close_detector function is designed to detect syllable open and close statuses in Thai text. This information is vital for phonetic analysis and linguistic research.

+
+ +
+
+pythainlp.util.text_to_arabic_digit(text: str) str[source]
+

This function converts spelled out digits in Thai to Arabic digits.

+
+
Parameters:
+

text – A digit spelled out in Thai

+
+
Returns:
+

An Arabic digit such as ‘1’, ‘2’, ‘3’ if the text is +digit spelled out in Thai (ศูนย์, หนึ่ง, สอง, …, เก้า). +Otherwise, it returns an empty string.

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import text_to_arabic_digit
+
+text_to_arabic_digit("ศูนย์")
+# output: 0
+text_to_arabic_digit("หนึ่ง")
+# output: 1
+text_to_arabic_digit("แปด")
+# output: 8
+text_to_arabic_digit("เก้า")
+# output: 9
+
+# For text that is not digit spelled out in Thai
+text_to_arabic_digit("สิบ") == ""
+# output: True
+text_to_arabic_digit("เก้าร้อย") == ""
+# output: True
+
+
+

The text_to_arabic_digit function is a numeral conversion tool that translates Thai text numerals into Arabic numeral form. It is useful for numerical data extraction and processing.

+
+ +
+
+pythainlp.util.text_to_num(text: str) List[str][source]
+

Thai text to list of Thai words with floating point numbers

+
+
Parameters:
+

text (str) – Thai text with the spelled-out numerals

+
+
Returns:
+

list of Thai words with float values of the input

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.util import text_to_num
+
+text_to_num("เก้าร้อยแปดสิบจุดเก้าห้าบาทนี่คือจำนวนทั้งหมด")
+# output: ['980.95', 'บาท', 'นี่', 'คือ', 'จำนวน', 'ทั้งหมด']
+
+text_to_num("สิบล้านสองหมื่นหนึ่งพันแปดร้อยแปดสิบเก้าบาท")
+# output: ['10021889', 'บาท']
+
+
+

The text_to_num function focuses on extracting numerical values from text data. This is essential for converting textual numbers into numerical form for computation.

+
+ +
+
+pythainlp.util.text_to_thai_digit(text: str) str[source]
+

This function converts spelled out digits in Thai to Thai digits.

+
+
Parameters:
+

text – A digit spelled out in Thai

+
+
Returns:
+

A Thai digit such as ‘๑’, ‘๒’, ‘๓’ if the text is digit +spelled out in Thai (ศูนย์, หนึ่ง, สอง, …, เก้า). +Otherwise, it returns an empty string.

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import text_to_thai_digit
+
+text_to_thai_digit("ศูนย์")
+# output: ๐
+text_to_thai_digit("หนึ่ง")
+# output: ๑
+text_to_thai_digit("แปด")
+# output: ๘
+text_to_thai_digit("เก้า")
+# output: ๙
+
+# For text that is not Thai digit spelled out
+text_to_thai_digit("สิบ") == ""
+# output: True
+text_to_thai_digit("เก้าร้อย") == ""
+# output: True
+
+
+

The text_to_thai_digit function converts digits spelled out in Thai (ศูนย์, หนึ่ง, …, เก้า) into Thai digit characters (๐, ๑, …, ๙). This is important for rendering numbers in Thai text naturally.

+
+ +
+
+pythainlp.util.thai_digit_to_arabic_digit(text: str) str[source]
+

This function converts Thai digits (i.e. ๑, ๓, ๑๐) to Arabic digits +(i.e. 1, 3, 10).

+
+
Parameters:
+

text (str) – Text with Thai digits such as ‘๑’, ‘๒’, ‘๓’

+
+
Returns:
+

Text with Thai digits converted to Arabic digits +such as ‘1’, ‘2’, ‘3’

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import thai_digit_to_arabic_digit
+
+text = 'เป็นจำนวน ๑๒๓,๔๐๐.๒๕ บาท'
+
+thai_digit_to_arabic_digit(text)
+# output: เป็นจำนวน 123,400.25 บาท
+
+
+

The thai_digit_to_arabic_digit function allows you to transform Thai numeral text into Arabic numeral format. This is valuable for numerical data extraction and computation tasks.

+
+ +
+
+pythainlp.util.thai_strftime(dt_obj: datetime, fmt: str = '%-d %b %y', thaidigit: bool = False) str[source]
+

Convert datetime.datetime into Thai date and time format.

+

The formatting directives are similar to datetime.strftime().

+
+
This function uses Thai names and Thai Buddhist Era for these directives:
    +
  • %a - abbreviated weekday name +(i.e. “จ”, “อ”, “พ”, “พฤ”, “ศ”, “ส”, “อา”)

  • +
  • %A - full weekday name +(i.e. “วันจันทร์”, “วันอังคาร”, “วันเสาร์”, “วันอาทิตย์”)

  • +
  • %b - abbreviated month name +(i.e. “ม.ค.”,”ก.พ.”,”มี.ค.”,”เม.ย.”,”พ.ค.”,”มิ.ย.”, “ธ.ค.”)

  • +
  • %B - full month name +(i.e. “มกราคม”, “กุมภาพันธ์”, “พฤศจิกายน”, “ธันวาคม”,)

  • +
  • %y - year without century (i.e. “56”, “10”)

  • +
  • %Y - year with century (i.e. “2556”, “2410”)

  • +
  • %c - date and time representation +(i.e. “พ 6 ต.ค. 01:40:00 2519”)

  • +
  • %v - short date representation +(i.e. “ 6-ม.ค.-2562”, “27-ก.พ.-2555”)

  • +
+
+
+

Other directives will be passed to datetime.strftime()

+
+
Note:
+
    +
  • The Thai Buddhist Era (BE) year is simply converted from AD +by adding 543. This is certainly not accurate for years +before 1941 AD, due to the change in Thai New Year’s Day.

  • +
• This is meant to be an interim solution, since Python’s standard locale module (which relies on C’s strftime()) does not support the “th” or “th_TH” locale yet. Once supported, we can simply call locale.setlocale(locale.LC_TIME, “th_TH”) and then use the native datetime.strftime().

  • +
+
+
+

We are trying to make this platform-independent and to support as many extensions as possible. See these links for strftime() extensions in POSIX, BSD, and GNU libc:

+
+
+
+
Parameters:
+
    +
• dt_obj (datetime) – an instantiated object of datetime.datetime

  • +
  • fmt (str) – string containing date and time directives

  • +
  • thaidigit (bool) – If thaidigit is set to False (default), +number will be represented in Arabic digit. +If it is set to True, it will be represented +in Thai digit.

  • +
+
+
Returns:
+

Date and time text, with the month in its Thai name and the year in the Thai Buddhist Era. The year is simply converted from AD by adding 543 (not accurate for years before 1941 AD, due to the change in Thai New Year’s Day).

+
+
Return type:
+

str

+
+
Example:
+

+
+
from datetime import datetime
+from pythainlp.util import thai_strftime
+
+datetime_obj = datetime(year=2019, month=6, day=9, \
+    hour=5, minute=59, second=0, microsecond=0)
+
+print(datetime_obj)
+# output: 2019-06-09 05:59:00
+
+thai_strftime(datetime_obj, "%A %d %B %Y")
+# output: 'วันอาทิตย์ 09 มิถุนายน 2562'
+
+thai_strftime(datetime_obj, "%a %-d %b %y")  # no padding
+# output: 'อา 9 มิ.ย. 62'
+
+thai_strftime(datetime_obj, "%a %_d %b %y")  # space padding
+# output: 'อา  9 มิ.ย. 62'
+
+thai_strftime(datetime_obj, "%a %0d %b %y")  # zero padding
+# output: 'อา 09 มิ.ย. 62'
+
+thai_strftime(datetime_obj, "%-H นาฬิกา %-M นาที", thaidigit=True)
+# output: '๕ นาฬิกา ๕๙ นาที'
+
+thai_strftime(datetime_obj, "%D (%v)")
+# output: '06/09/62 ( 9-มิ.ย.-2562)'
+
+thai_strftime(datetime_obj, "%c")
+# output: 'อา  9 มิ.ย. 05:59:00 2562'
+
+thai_strftime(datetime_obj, "%H:%M %p")
+# output: '05:59 AM'
+
+thai_strftime(datetime_obj, "%H:%M %#p")
+# output: '05:59 am'
+
+
+

The thai_strftime function is a date formatting tool tailored for Thai culture. It is essential for displaying dates and times in a format that adheres to Thai conventions.

+
+ +
+
+pythainlp.util.thai_strptime(text: str, fmt: str, year: str = 'be', add_year: int | None = None, tzinfo=zoneinfo.ZoneInfo(key='Asia/Bangkok'))[source]
+

Thai strptime

+
+
Parameters:
+
    +
  • text (str) – text

  • +
  • fmt (str) – string containing date and time directives

  • +
  • year (str) – year of the text (ad is Anno Domini and be is Buddhist Era)

  • +
  • add_year (int) – add to year when converting to ad

  • +
  • tzinfo (object) – tzinfo (default is Asia/Bangkok)

  • +
+
+
Returns:
+

The text converted to a datetime.datetime object

+
+
Return type:
+

datetime.datetime

+
+
+
+
The fmt chars that are supported:
    +
  • %d - Day (1 - 31)

  • +
  • %B - Thai month (03, 3, มี.ค., or มีนาคม)

  • +
  • %Y - Year (66, 2566, or 2023)

  • +
  • %H - Hour (0 - 23)

  • +
  • %M - Minute (0 - 59)

  • +
  • %S - Second (0 - 59)

  • +
  • %f - Microsecond

  • +
+
+
+
+
Example:
+

+
+
from pythainlp.util import thai_strptime
+
+thai_strptime("15 ก.ค. 2565 09:00:01","%d %B %Y %H:%M:%S")
+# output:
+# datetime.datetime(
+#   2022,
+#   7,
+#   15,
+#   9,
+#   0,
+#   1,
+#   tzinfo=zoneinfo.ZoneInfo(key='Asia/Bangkok')
+# )
+
+
+

The thai_strptime function focuses on parsing dates and times in a Thai-specific format, making it easier to work with date and time data in a Thai context.

+
+ +
+
+pythainlp.util.thai_to_eng(text: str) str[source]
+

Corrects the given text that was incorrectly typed using Thai Kedmanee +keyboard layout to the originally intended keyboard layout +that is the English-US Qwerty keyboard.

+
+
Parameters:
+

text (str) – incorrect text input (English typed using Thai keyboard)

+
+
Returns:
+

English text with typing with +incorrect keyboard layout is corrected

+
+
Return type:
+

str

+
+
Example:
+

+
+

Intentionally type “Bank of Thailand”, but got “ฺฟืา นด ธ้ฟรสฟืก”:

+
from pythainlp.util import thai_to_eng
+
+thai_to_eng("ฺฟืา นด ธ้ฟรสฟืก")
+# output: 'Bank of Thailand'
+
+
+

The thai_to_eng function corrects text that was meant to be English but was typed with the Thai Kedmanee keyboard layout, mapping each keystroke back to the English-US QWERTY layout. It is useful for fixing this common typing mistake.

+
+ +
+
+pythainlp.util.to_idna(text: str) str[source]
+

Encode text with IDNA, as used in Internationalized Domain Name (IDN).

+
+
Parameters:
+

text (str) – Thai text

+
+
Returns:
+

IDNA-encoded text

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import to_idna
+
+to_idna("คนละครึ่ง.com")
+# output: 'xn--42caj4e6bk1f5b1j.com'
+
+
+

The to_idna function encodes Thai text with IDNA, producing the ASCII form used in Internationalized Domain Names (IDN), such as Thai-language domain names.

+
+ +
+
+pythainlp.util.thai_word_tone_detector(word: str) Tuple[str, str][source]
+

Thai tone detector for word.

+

It uses pythainlp.transliterate.pronunciate to convert the word into its pronunciation.

+
+
Parameters:
+

word (str) – Thai word.

+
+
Returns:
+

Thai pronunciation with tones in each syllable. (l, m, h, r, f or empty if it cannot be detected)

+
+
Return type:
+

Tuple[str, str]

+
+
Example:
+

+
+
from pythainlp.util import thai_word_tone_detector
+
+print(thai_word_tone_detector("คนดี"))
+# output: [('คน', 'm'), ('ดี', 'm')]
+
+print(thai_word_tone_detector("มือถือ"))
+# output: [('มือ', 'm'), ('ถือ', 'r')]
+
+
+

The thai_word_tone_detector function specializes in detecting and processing tonal marks in Thai words. It is essential for phonetic analysis and pronunciation guides.

+
+ +
+
+pythainlp.util.thaiword_to_date(text: str, date: datetime | None = None) datetime | None[source]
+

Convert Thai relative date to datetime.datetime.

+
+
Parameters:
+
    +
  • text (str) – Thai text containing relative date

  • +
  • date (datetime.datetime) – date (default is datetime.datetime.now())

  • +
+
+
Returns:
+

datetime object, if it can be calculated. Otherwise, None.

+
+
Return type:
+

datetime.datetime

+
+
Example:
+

from pythainlp.util import thaiword_to_date

thaiword_to_date("พรุ่งนี้")
# output: datetime of tomorrow

+
+
+

The thaiword_to_date function facilitates the conversion of Thai word representations of dates into standardized date formats. This is important for date data extraction and processing.

+
+ +
+
+pythainlp.util.thaiword_to_num(word: str) int[source]
+

Converts the spelled-out numerals in Thai scripts into an actual integer.

+
+
Parameters:
+

word (str) – Spelled-out numerals in Thai scripts

+
+
Returns:
+

Corresponding integer value of the input

+
+
Return type:
+

int

+
+
Example:
+

+
+
from pythainlp.util import thaiword_to_num
+
+thaiword_to_num("ศูนย์")
+# output: 0
+
+thaiword_to_num("สองล้านสามแสนหกร้อยสิบสอง")
+# output: 2300612
+
+
+

The thaiword_to_num function is a numeral conversion tool for translating Thai word numerals into numerical form. This is essential for numerical data extraction and computation.

+
+ +
+
+pythainlp.util.thaiword_to_time(text: str, padding: bool = True) str[source]
+

Convert Thai time in words into time (H:M).

+
+
Parameters:
+
    +
  • text (str) – Thai time in words

  • +
  • padding (bool) – Zero pad the hour if True

  • +
+
+
Returns:
+

time string

+
+
Return type:
+

str

+
+
Example:
+

+
+
+thaiword_to_time("บ่ายโมงครึ่ง")
+# output:
+# 13:30
+
+
+

The thaiword_to_time function is designed for converting Thai word representations of time into standardized time formats. It is crucial for time data extraction and processing.

+
+ +
+
+pythainlp.util.time_to_thaiword(time_data: time | datetime | str, fmt: str = '24h', precision: str | None = None) str[source]
+

Spell out time as Thai words.

+
+
Parameters:
+
    +
  • time_data (str) – time input, can be a datetime.time object or a datetime.datetime object or a string (in H:M or H:M:S format, using 24-hour clock)

  • +
  • fmt (str) – time output format +* 24h - 24-hour clock (default) +* 6h - 6-hour clock +* m6h - Modified 6-hour clock

  • +
  • precision (str) – precision of the spell out time +* m - always spell out at minute level +* s - always spell out at second level +* None - spell out only non-zero parts

  • +
+
+
Returns:
+

Time spelled out as Thai words

+
+
Return type:
+

str

+
+
Example:
+

+
+
time_to_thaiword("8:17")
+# output:
+# แปดนาฬิกาสิบเจ็ดนาที
+
+time_to_thaiword("8:17", "6h")
+# output:
+# สองโมงเช้าสิบเจ็ดนาที
+
+time_to_thaiword("8:17", "m6h")
+# output:
+# แปดโมงสิบเจ็ดนาที
+
+time_to_thaiword("18:30", fmt="m6h")
+# output:
+# หกโมงครึ่ง
+
+time_to_thaiword(datetime.time(12, 3, 0))
+# output:
+# สิบสองนาฬิกาสามนาที
+
+time_to_thaiword(datetime.time(12, 3, 0), precision="s")
+# output:
+# สิบสองนาฬิกาสามนาทีศูนย์วินาที
+
+
+

The time_to_thaiword function focuses on converting time values into Thai word representations. This is valuable for rendering time in a natural Thai textual format.

+
+ +
+
+pythainlp.util.tis620_to_utf8(text: str) str[source]
+

Convert TIS-620 to UTF-8

+
+
Parameters:
+

text (str) – TIS-620 encoded text

+
+
Returns:
+

UTF-8 encoded text

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import tis620_to_utf8
+
+tis620_to_utf8("¡ÃзÃÇ§ÍØµÊÒË¡ÃÃÁ")
+# output: 'กระทรวงอุตสาหกรรม'
+
+
+

The tis620_to_utf8 function serves as a character encoding conversion tool for converting TIS-620 encoded text into UTF-8 format. This is significant for character encoding compatibility.

+
+ +
+
+pythainlp.util.tone_detector(syllable: str) str[source]
+

Thai tone detector for syllables

+

Return tone of a syllable.

+
    +
  • l: low

  • +
  • m: mid

  • +
  • r: rising

  • +
  • f: falling

  • +
  • h: high

  • +
  • empty string: cannot be detected

  • +
+
+
Parameters:
+

syllable (str) – Thai syllable

+
+
Returns:
+

syllable’s tone (l, m, h, r, f) or empty if it cannot be detected

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import tone_detector
+
+print(tone_detector("มา"))
+# output: m
+
+print(tone_detector("ไม้"))
+# output: h
+
+
+

The tone_detector function detects the phonological tone of a Thai syllable. It is essential for phonetic analysis and pronunciation guides.

+
+ +
+
+pythainlp.util.words_to_num(words: list) float[source]
+

Thai Words to float

+
+
Parameters:
+

words (list) – list of Thai number words

+
+
Returns:
+

float of words

+
+
Return type:
+

float

+
+
Example:
+

+
+
from pythainlp.util import words_to_num
+
+words_to_num(["ห้า", "สิบ", "จุด", "เก้า", "ห้า"])
+# output: 50.95
+
+
+

The words_to_num function is a numeral conversion utility that translates Thai word numerals into numerical form. It is important for numerical data extraction and computation.

+
+ +
+
+pythainlp.util.thai_consonant_to_spelling(c: str) str[source]
+

Thai consonants to spelling

+
+
Parameters:
+

c (str) – A Thai consonant

+
+
Returns:
+

spelling

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import thai_consonant_to_spelling
+
+print(tone_to_spelling("ก"))
+# output: กอ
+
+
+
+ +
+
+pythainlp.util.tone_to_spelling(t: str) str[source]
+

Thai tonemarks to spelling

+
+
Parameters:
+

t (str) – A Thai tone mark

+
+
Returns:
+

spelling

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util import tone_to_spelling
+
+print(tone_to_spelling("่")) # ไม้เอก
+# output: ไม้เอก
+
+
+
+ +
+
+pythainlp.util.spell_words.spell_syllable(text: str) List[str][source]
+

Spell out syllables in Thai word distribution form.

+
+
Parameters:
+

s (str) – Thai syllables only

+
+
Returns:
+

List of spelled out syllables

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.util.spell_words import spell_syllable
+
+print(spell_syllable("แมว"))
+# output: ['มอ', 'วอ', 'แอ', 'แมว']
+
+
+

The pythainlp.util.spell_words.spell_syllable function focuses on spelling syllables in Thai text, an important feature for phonetic analysis and linguistic research.

+
+ +
+
+pythainlp.util.spell_words.spell_word(text: str) List[str][source]
+

Spell out words in Thai word distribution form.

+
+
Parameters:
+

text (str) – Thai words only

+
+
Returns:
+

List of spelled out words

+
+
Return type:
+

List[str]

+
+
Example:
+

+
+
from pythainlp.util.spell_words import spell_word
+
+print(spell_word("คนดี"))
+# output: ['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']
+
+
+

The pythainlp.util.spell_words.spell_word function is designed for spelling individual words in Thai text, facilitating phonetic analysis and pronunciation guides.

+
+ +
+
+pythainlp.util.to_lunar_date(input_date: date) str[source]
+

Convert the solar date to Thai Lunar Date

+
+
Parameters:
+

input_date (date) – the solar date to convert.

+
+
Returns:
+

Thai text lunar date

+
+
Return type:
+

str

+
+
+

The to_lunar_date function converts a solar (Gregorian) date to the Thai lunar date.
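A minimal usage sketch, assuming only the documented signature above (a datetime.date in, a Thai-language lunar date string out); no literal output is shown because the exact wording depends on the library's calendar data.

import datetime
+
+from pythainlp.util import to_lunar_date
+
+solar = datetime.date(2023, 1, 1)
+print(to_lunar_date(solar))
+# prints the Thai lunar date string corresponding to 1 January 2023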

+
+ +
+
+pythainlp.util.th_zodiac(year: int, output_type: int = 1) str | int[source]
+

Thai Zodiac Year Name +Converts a Gregorian year to its corresponding Zodiac name.

+
+
Parameters:
+
    +
  • year (int) – The Gregorian year. AD (Anno Domini)

  • +
  • output_type (int) – Output type (1 = Thai, 2 = English, 3 = Number).

  • +
+
+
Returns:
+

The Zodiac name or number corresponding to the input year.

+
+
Return type:
+

Union[str, int]

+
+
+

The th_zodiac function converts a Gregorian year to its corresponding Thai zodiac name.
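A minimal usage sketch based on the documented parameters above; the literal return values are not reproduced here.

from pythainlp.util import th_zodiac
+
+print(th_zodiac(2023))                 # Thai zodiac name (output_type=1, the default)
+print(th_zodiac(2023, output_type=2))  # English zodiac name
+print(th_zodiac(2023, output_type=3))  # zodiac number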

+
+ +
+
+class pythainlp.util.Trie(words: Iterable[str])[source]
+

The Trie class is a data structure for efficient dictionary operations. It’s a valuable resource for managing and searching word lists and dictionaries in a structured and efficient manner. A short usage sketch follows the method listing below.

+
+
+class Node[source]
+
+
+__init__()[source]
+
+ +
+
+end
+
+ +
+
+children
+
+ +
+ +
+
+__init__(words: Iterable[str])[source]
+
+ +
+
+add(word: str) None[source]
+

Add a word to the trie. +Spaces in front of and following the word will be removed.

+
+
Parameters:
+

word (str) – a word to add

+
+
+
+ +
+
+remove(word: str) None[source]
+

Remove a word from the trie. +If the word is not found, do nothing.

+
+
Parameters:
+

word (str) – a word to remove

+
+
+
+ +
+
+prefixes(text: str) List[str][source]
+

List all words in the dictionary that are prefixes of the given text.

+
+
Parameters:
+

text (str) – a word

+
+
Returns:
+

a list of possible words

+
+
Return type:
+

List[str]

+
+
+
+ +
+ +
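A minimal usage sketch of the Trie methods documented above; the word list is illustrative and the exact ordering of the returned prefixes may differ.

from pythainlp.util import Trie
+
+trie = Trie(["ทด", "ทดสอบ", "สอบ"])
+trie.add("ระบบ")      # add a word (surrounding spaces would be stripped)
+trie.remove("สอบ")    # remove a word; removing a missing word does nothing
+print(trie.prefixes("ทดสอบระบบ"))
+# expected: the trie words that are prefixes of the text, e.g. ['ทด', 'ทดสอบ']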
+
+pythainlp.util.longest_common_subsequence(str1: str, str2: str) str[source]
+

Find the longest common subsequence between two strings.

+
+
Parameters:
+
    +
  • str1 (str) – The first string.

  • +
  • str2 (str) – The second string.

  • +
+
+
Returns:
+

The longest common subsequence.

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util.lcs import longest_common_subsequence
+
+print(longest_common_subsequence("ABCBDAB", "BDCAB"))
+# output: "BDAB"
+
+
+

The longest_common_subsequence function finds the longest common subsequence between two strings.

+
+ +
+
+pythainlp.util.morse.morse_encode(text: str, lang: str = 'th') str[source]
+

Convert text to Morse code (supports Thai and English)

+
+
Parameters:
+
    +
  • text (str) – Text

  • +
  • lang (str) – Language Code (th is Thai and en is English)

  • +
+
+
Returns:
+

Morse code

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util.morse import morse_encode
+print(morse_encode("แมว", lang="th"))
+# output: .-.- -- .--
+
+print(morse_encode("cat", lang="en"))
+# output: -.-. .- -
+
+
+

The pythainlp.util.morse.morse_encode function converts text to Morse code.

+
+ +
+
+pythainlp.util.morse.morse_decode(morse_text: str, lang: str = 'th') str[source]
+

Convert Morse code to text (simple implementation)

+

Thai output may still contain some incorrect characters, which can be fixed with a spell corrector.

+
+
Parameters:
+
    +
  • morse_text (str) – Morse code

  • +
  • lang (str) – Language Code (th is Thai and en is English)

  • +
+
+
Returns:
+

Text

+
+
Return type:
+

str

+
+
Example:
+

+
+
from pythainlp.util.morse import morse_decode
+print(morse_decode(".-.- -- .--", lang="th"))
+# output: แมว
+
+print(morse_decode("-.-. .- -", lang="en"))
+# output: CAT
+
+
+

The pythainlp.util.morse.morse_decode function converts Morse code back to text.

+
+ +
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/wangchanberta.html b/5.1/api/wangchanberta.html new file mode 100644 index 0000000..9626cb2 --- /dev/null +++ b/5.1/api/wangchanberta.html @@ -0,0 +1,299 @@ + + + + + + + + + pythainlp.wangchanberta — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.wangchanberta

+

The pythainlp.wangchanberta module is built upon the WangchanBERTa base model, specifically the wangchanberta-base-att-spm-uncased model, as detailed in the paper by Lowphansirikul et al. [^Lowphansirikul_2021].

+

This base model is utilized for various natural language processing tasks in the Thai language, including named entity recognition, part-of-speech tagging, and subword tokenization.

+

If you intend to fine-tune the model or explore its capabilities further, please refer to the [thai2transformers repository](https://github.com/vistec-AI/thai2transformers).

+

Speed Benchmark

Function                         Named Entity Recognition    Part of Speech
PyThaiNLP basic function         89.7 ms                     312 ms
pythainlp.wangchanberta (CPU)    9.64 s                      9.65 s
pythainlp.wangchanberta (GPU)    8.02 s                      8 s
+

For a comprehensive performance benchmark, the following notebooks are available:

+ +
+

Modules

+
+
+class pythainlp.wangchanberta.NamedEntityRecognition(model: str = 'pythainlp/thainer-corpus-v2-base-model')[source]
+

The NamedEntityRecognition class is a fundamental component for identifying named entities in Thai text. It allows you to extract entities such as names, locations, and organizations from text data.

+
+
+__init__(model: str = 'pythainlp/thainer-corpus-v2-base-model') None[source]
+

This function tags named entities in text in IOB format.

+

Powered by wangchanberta from the VISTEC-depa AI Research Institute of Thailand. The model argument specifies which wangchanberta-based pretrained model to use.

+
+ +
+
+get_ner(text: str, pos: bool = False, tag: bool = False) List[Tuple[str, str]] | str[source]
+

This function tags named entities in text in IOB format. +Powered by wangchanberta from VISTEC-depa AI Research Institute of Thailand

+
+
Parameters:
+
    +
  • text (str) – text in Thai to be tagged

  • +
  • tag (bool) – output HTML-like tags.

  • +
+
+
Returns:
+

a list of tuples associated with tokenized word groups, NER tags, and output HTML-like tags (if the parameter tag is specified as True). Otherwise, return a list of tuples associated with tokenized words and NER tags

+
+
Return type:
+

Union[List[Tuple[str, str]], str]

+
+
+
+ +
+ +
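A minimal usage sketch for NamedEntityRecognition; the first call downloads the default Hugging Face model, and the exact tokens and IOB tags depend on that model, so no literal output is reproduced here.

from pythainlp.wangchanberta import NamedEntityRecognition
+
+ner = NamedEntityRecognition()  # defaults to "pythainlp/thainer-corpus-v2-base-model"
+result = ner.get_ner("นายสมชายเดินทางไปเชียงใหม่เมื่อวานนี้")
+print(result)  # a list of (token, IOB tag) tuples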
+
+class pythainlp.wangchanberta.ThaiNameTagger(dataset_name: str = 'thainer', grouped_entities: bool = True)[source]
+

The ThaiNameTagger class is designed for tagging Thai names within text. This is essential for tasks such as entity recognition, information extraction, and text classification.

+
+
+__init__(dataset_name: str = 'thainer', grouped_entities: bool = True)[source]
+

This function tags named entities in text in IOB format.

+

Powered by wangchanberta from VISTEC-depa AI Research Institute of Thailand

+
+
Parameters:
+
    +
  • dataset_name (str) –

      +
    • thainer - ThaiNER dataset

    • +
    +

  • +
  • grouped_entities (bool) – grouped entities

  • +
+
+
+
+ +
+
+get_ner(text: str, pos: bool = False, tag: bool = False) List[Tuple[str, str]] | str[source]
+

This function tags named entities in text in IOB format. +Powered by wangchanberta from VISTEC-depa AI Research Institute of Thailand

+
+
Parameters:
+
    +
  • text (str) – text in Thai to be tagged

  • +
  • tag (bool) – output HTML-like tags.

  • +
+
+
Returns:
+

a list of tuples associated with tokenized word groups, NER tags, and output HTML-like tags (if the parameter tag is specified as True). Otherwise, return a list of tuples associated with tokenized words and NER tags

+
+
Return type:
+

Union[List[Tuple[str, str]], str]

+
+
+
+ +
+ +
+
+pythainlp.wangchanberta.segment(text: str) List[str][source]
+

Subword tokenization using SentencePiece from the wangchanberta model.

+
+
Parameters:
+

text (str) – text to be tokenized

+
+
Returns:
+

list of subwords

+
+
Return type:
+

list[str]

+
+
+

The segment function is a subword tokenization tool that breaks down text into subword units, offering a foundation for further text processing and analysis.
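A minimal usage sketch; the exact subword pieces depend on the SentencePiece model shipped with wangchanberta, so no literal output is shown.

from pythainlp.wangchanberta import segment
+
+print(segment("ทดสอบการตัดคำย่อย"))
+# a list of subword strings produced by the SentencePiece tokenizer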

+
+ +
+
+

References

+

[^Lowphansirikul_2021] Lowphansirikul L, Polpanumas C, Jantrakulchai N, Nutanong S. WangchanBERTa: Pretraining transformer-based Thai Language Models. [ArXiv:2101.09635](http://arxiv.org/abs/2101.09635) [Internet]. 2021 Jan 23 [cited 2021 Feb 27].

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/word_vector.html b/5.1/api/word_vector.html new file mode 100644 index 0000000..c016c01 --- /dev/null +++ b/5.1/api/word_vector.html @@ -0,0 +1,509 @@ + + + + + + + + + pythainlp.word_vector — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.word_vector

+

The word_vector module contains functions that make use of pre-trained, publicly available word vector data. +The pythainlp.word_vector module is a valuable resource for working with pre-trained word vectors. These word vectors are trained on large corpora and can be used for various natural language processing tasks, such as word similarity, document similarity, and more.

+
+

Dependencies

+

Installation of numpy and gensim is required.

+

Before using this module, you need to ensure that the numpy and gensim libraries are installed in your environment. These libraries are essential for loading and working with the pre-trained word vectors.

+
+
+

Modules

+
+
+class pythainlp.word_vector.WordVector(model_name: str = 'thai2fit_wv')[source]
+

Word Vector class

+
+
Parameters:
+

model_name (str) – model name

+
+
+
+
Options for model_name
    +
  • thai2fit_wv (default) - word vector from thai2fit

  • +
  • ltw2v - word vector from LTW2V: The Large Thai Word2Vec v0.1

  • +
  • ltw2v_v1.0_15_window - word vector from LTW2V v1.0 and 15 window

  • +
  • ltw2v_v1.0_5_window - word vector from LTW2V v1.0 and 5 window

  • +
+
+
+

The WordVector class encapsulates word vector operations and functions. It provides a convenient interface for loading models, finding word similarities, and generating sentence vectors.

+
+
+__init__(model_name: str = 'thai2fit_wv') None[source]
+

Word Vector class

+
+
Parameters:
+

model_name (str) – model name

+
+
+
+
Options for model_name
    +
  • thai2fit_wv (default) - word vector from thai2fit

  • +
  • ltw2v - word vector from LTW2V: The Large Thai Word2Vec

  • +
  • ltw2v_v1.0_15_window - word2vec from LTW2V 1.0 and 15 window

  • +
  • ltw2v_v1.0_5_window - word2vec from LTW2V v1.0 and 5 window

  • +
+
+
+
+ +
+
+load_wordvector(model_name: str)[source]
+

Load word vector model.

+
+
Parameters:
+

model_name (str) – model name

+
+
+
+ +
+
+get_model() KeyedVectors[source]
+

Get word vector model.

+
+
Returns:
+

gensim word2vec model

+
+
Return type:
+

gensim.models.keyedvectors.Word2VecKeyedVectors

+
+
+
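A minimal sketch showing load_wordvector and get_model together; it assumes the thai2fit_wv model data is available locally and that the returned object follows the standard gensim KeyedVectors API.

from pythainlp.word_vector import WordVector
+
+wv = WordVector()                    # loads the default "thai2fit_wv" model
+wv.load_wordvector("thai2fit_wv")    # reload, or switch to another supported model name
+model = wv.get_model()               # gensim KeyedVectors object
+print(model.vector_size)             # embedding dimension (300 for thai2fit_wv)
+print(len(model.index_to_key))       # vocabulary size (gensim 4.x attribute)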
+ +
+
+doesnt_match(words: List[str]) str[source]
+

This function returns the one word that is most unrelated to the other words +in the list. We use the function doesnt_match() +from gensim.

+
+
Parameters:
+

words (list) – a list of words

+
+
Raises:
+

KeyError – if there is any word in positive or negative that is +not in the vocabulary of the model.

+
+
Returns:
+

the word that is most unrelated to the others

+
+
Return type:
+

str

+
+
Note:
+
    +
  • If a word in words is not in the vocabulary, KeyError +will be raised.

  • +
+
+
Example:
+

+
+

Pick the word "พริกไทย" (name of food) out of the list of meals ("อาหารเช้า", "อาหารเที่ยง", "อาหารเย็น").
+>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> words = ['อาหารเช้า', 'อาหารเที่ยง', 'อาหารเย็น', 'พริกไทย']
+>>> wv.doesnt_match(words)
+พริกไทย

+

Pick the word “เรือ” (name of vehicle) out of the list of words +related to occupation (“ดีไซน์เนอร์”, “พนักงานเงินเดือน”, “หมอ”).

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> words = ['ดีไซน์เนอร์', 'พนักงานเงินเดือน', 'หมอ', 'เรือ']
+>>> wv.doesnt_match(words)
+เรือ
+
+
+
+ +
+
+most_similar_cosmul(positive: List[str], negative: List[str]) List[Tuple[str, float]][source]
+

This function finds the top-10 words that are most similar with respect +to two lists of words labeled as positive and negative. +The top-10 most similar words are obtained using multiplication +combination objective from Omer Levy and Yoav Goldberg +[OmerLevy_YoavGoldberg_2014].

+

We use the function gensim.most_similar_cosmul() directly from +gensim.

+
+
Parameters:
+
    +
  • positive (list) – a list of words to add

  • +
  • negative (list) – a list of words to subtract

  • +
+
+
Raises:
+

KeyError – if there is any word in positive or negative that is +not in the vocabulary of the model.

+
+
Returns:
+

list of top-10 most similar words and its similarity score

+
+
Return type:
+

list[tuple[str, float]]

+
+
Note:
+
    +
  • With a single word in the positive list, it will find the +most similar words to the word given (similar +to gensim.most_similar())

  • +
  • If a word in positive or negative is not in the vocabulary, +KeyError will be raised.

  • +
+
+
Example:
+

+
+

Find the top-10 most similar words to the word: “แม่น้ำ”.

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> list_positive = ['แม่น้ำ']
+>>> list_negative = []
+>>> wv.most_similar_cosmul(list_positive, list_negative)
+[('ลำน้ำ', 0.8206598162651062), ('ทะเลสาบ', 0.775945782661438),
+('ลุ่มน้ำ', 0.7490593194961548), ('คลอง', 0.7471904754638672),
+('ปากแม่น้ำ', 0.7354257106781006), ('ฝั่งแม่น้ำ', 0.7120099067687988),
+('ทะเล', 0.7030453681945801), ('ริมแม่น้ำ', 0.7015200257301331),
+('แหล่งน้ำ', 0.6997432112693787), ('ภูเขา', 0.6960948705673218)]
+
+
+

Find the top-10 most similar words to the words: “นายก”, +“รัฐมนตรี”, and “ประเทศ”.

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> list_positive = ['นายก', 'รัฐมนตรี', 'ประเทศ']
+>>> list_negative = []
+>>> wv.most_similar_cosmul(list_positive, list_negative)
+[('รองนายกรัฐมนตรี', 0.2730445861816406),
+('เอกอัครราชทูต', 0.26500266790390015),
+('นายกรัฐมนตรี', 0.2649088203907013),
+('ผู้ว่าราชการจังหวัด', 0.25119125843048096),
+('ผู้ว่าการ', 0.2510434687137604), ('เลขาธิการ', 0.24824175238609314),
+('ผู้ว่า', 0.2453523576259613), ('ประธานกรรมการ', 0.24147476255893707),
+('รองประธาน', 0.24123257398605347), ('สมาชิกวุฒิสภา',
+0.2405330240726471)]
+
+
+

Find the top-10 most similar words, first with only a positive +list and then with both positive and negative lists.

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> list_positive = ['ประเทศ', 'ไทย', 'จีน', 'ญี่ปุ่น']
+>>> list_negative = []
+>>> wv.most_similar_cosmul(list_positive, list_negative)
+[('ประเทศจีน', 0.22022421658039093), ('เกาหลี', 0.2196873426437378),
+('สหรัฐอเมริกา', 0.21660110354423523),
+('ประเทศญี่ปุ่น', 0.21205860376358032),
+('ประเทศไทย', 0.21159221231937408), ('เกาหลีใต้',
+0.20321202278137207),
+('อังกฤษ', 0.19610872864723206), ('ฮ่องกง', 0.1928885132074356),
+('ฝรั่งเศส', 0.18383873999118805), ('พม่า', 0.18369348347187042)]
+>>>
+>>> list_positive = ['ประเทศ', 'ไทย', 'จีน', 'ญี่ปุ่น']
+>>> list_negative = ['อเมริกา']
+>>> wv.most_similar_cosmul(list_positive, list_negative)
+[('ประเทศไทย', 0.3278159201145172), ('เกาหลี', 0.3201899230480194),
+('ประเทศจีน', 0.31755179166793823), ('พม่า', 0.30845439434051514),
+('ประเทศญี่ปุ่น', 0.306713730096817),
+('เกาหลีใต้', 0.3003999888896942),
+('ลาว', 0.2995176911354065), ('คนไทย', 0.2885020673274994),
+('เวียดนาม', 0.2878379821777344), ('ชาวไทย', 0.28480708599090576)]
+
+
+

The function raises KeyError when the term “เมนูอาหารไทย” +is not in the vocabulary.

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> list_positive = ['เมนูอาหารไทย']
+>>> list_negative = []
+>>> wv.most_similar_cosmul(list_positive, list_negative)
+KeyError: "word 'เมนูอาหารไทย' not in vocabulary"
+
+
+
+ +
+
+similarity(word1: str, word2: str) float[source]
+

This function computes cosine similarity between two words.

+
+
Parameters:
+
    +
  • word1 (str) – first word to be compared with

  • +
  • word2 (str) – second word to be compared with

  • +
+
+
Raises:
+

KeyError – if either word1 or word2 is not in the +vocabulary of the model.

+
+
Returns:
+

the cosine similarity between the two word vectors

+
+
Return type:
+

float

+
+
Note:
+
    +
  • If a word in word1 or word2 is not in the vocabulary, +KeyError will be raised.

  • +
+
+
Example:
+

+
+

Compute cosine similarity between two words: “รถไฟ” and “รถไฟฟ้า” +(train and electric train).

+
>>> from pythainlp.word_vector import WordVector
+>>> wv = WordVector()
+>>> wv.similarity('รถไฟ', 'รถไฟฟ้า')
+0.43387136
+
+
+

Compute cosine similarity between two words: “เสือดาว” and “รถไฟฟ้า” +(leopard and electric train).

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> wv.similarity('เสือดาว', 'รถไฟฟ้า')
+0.04300258
+
+
+
+ +
+
+sentence_vectorizer(text: str, use_mean: bool = True) ndarray[source]
+

This function converts a Thai sentence into a vector. +Specifically, it first tokenizes the text and maps each tokenized word +to a word vector from the model. +Then, the word vectors are aggregated into one 300-dimensional vector +by calculating either the mean or the summation of all word vectors.

+
+
Parameters:
+
    +
  • text (str) – text input

  • +
  • use_mean (bool) – if True aggregate word vectors with mean of all +word vectors. Otherwise, aggregate with +summation of all word vectors

  • +
+
+
Returns:
+

300-dimension vector representing the given sentence +in form of numpy array

+
+
Return type:
+

numpy.ndarray((1,300))

+
+
Example:
+

+
+

Vectorize the sentence, “อ้วนเสี้ยวเข้ายึดแคว้นกิจิ๋ว ในปี พ.ศ. 735”, +into one sentence vector with two aggregation methods: mean +and summation.

+
>>> from pythainlp.word_vector import WordVector
+>>>
+>>> wv = WordVector()
+>>> sentence = 'อ้วนเสี้ยวเข้ายึดแคว้นกิจิ๋ว ในปี พ.ศ. 735'
+>>> wv.sentence_vectorizer(sentence, use_mean=True)
+array([[-0.00421414, -0.08881307,  0.05081136, -0.05632929,
+     -0.06607185, 0.03059357, -0.113882  , -0.00074836,  0.05035743,
+     0.02914307,
+     ...
+    0.02893357,  0.11327957,  0.04562086, -0.05015393,  0.11641257,
+    0.32304936, -0.05054322,  0.03639471, -0.06531371,  0.05048079]])
+>>>
+>>> wv.sentence_vectorizer(sentence, use_mean=False)
+array([[-0.05899798, -1.24338295,  0.711359  , -0.78861002,
+     -0.92500597, 0.42831   , -1.59434797, -0.01047703,  0.705004
+    ,  0.40800299,
+    ...
+    0.40506999,  1.58591403,  0.63869202, -0.702155  ,  1.62977601,
+    4.52269109, -0.70760502,  0.50952601, -0.914392  ,  0.70673105]])
+
+
+
+ +
+ +
+
+

References

+
    +
  • [Omer Levy and Yoav Goldberg (2014). Linguistic Regularities in Sparse and Explicit Word Representations](https://www.aclweb.org/anthology/W14-1618/) +This reference points to the work by Omer Levy and Yoav Goldberg, which discusses linguistic regularities in word representations. It underlines the theoretical foundation of word vectors and their applications in NLP.

  • +
+

Together, these functions make the pythainlp.word_vector module a valuable resource for NLP practitioners and researchers working with pre-trained word vectors in the Thai language.

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/api/wsd.html b/5.1/api/wsd.html new file mode 100644 index 0000000..9c64a0c --- /dev/null +++ b/5.1/api/wsd.html @@ -0,0 +1,205 @@ + + + + + + + + + pythainlp.wsd — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

pythainlp.wsd

+

The pythainlp.wsd module contains a get_sense function for Thai Word Sense Disambiguation (WSD). +The pythainlp.wsd module is designed to assist in Word Sense Disambiguation (WSD) for the Thai language. Word Sense Disambiguation is a crucial task in natural language processing that involves determining the correct sense or meaning of a word within a given context. This module provides a function for achieving precisely that.

+
+

Modules

+
+
+pythainlp.wsd.get_sense(sentence: str, word: str, device: str = 'cpu', custom_dict: dict = {}, custom_tokenizer: ~pythainlp.tokenize.core.Tokenizer = <pythainlp.tokenize.core.Tokenizer object>) List[Tuple[str, float]][source]
+

Get the word sense from a sentence. +This function returns each candidate definition and its distance from the context of the sentence.

+
+
Parameters:
+
    +
  • sentence (str) – Thai sentence

  • +
  • word (str) – Thai word

  • +
  • device (str) – device for running model on.

  • +
  • custom_dict (dict) – Thai dictionary {“word”:[“definition”,..]}

  • +
  • custom_tokenizer (Tokenizer) – Tokenizer used to tokenize words in sentence.

  • +
+
+
Returns:
+

a list of definitions and distances (1 - cos_sim) or an empty list (if word is not in the dictionary)

+
+
Return type:
+

List[Tuple[str, float]]

+
+
+

The get_sense function is built on ideas from Context-Aware Semantic Similarity Measurement for Unsupervised Word Sense Disambiguation.

+

It uses a Thai dictionary from Wiktionary. +See thai_dict.

+

It uses the sentence-transformers model sentence-transformers/paraphrase-multilingual-mpnet-base-v2 for unsupervised word sense disambiguation.

+
+
Example:
+

+
+
from pythainlp.wsd import get_sense
+print(get_sense("เขากำลังอบขนมคุกกี้","คุกกี้"))
+# output:
+# [('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',
+#   0.0974416732788086),
+#  ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',
+#   0.09319090843200684)]
+
+print(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คุกกี้"))
+# output:
+# [('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',
+#   0.1005704402923584),
+#  ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',
+#   0.12473666667938232)]
+
+
+

The get_sense function is the primary tool within this module for performing Word Sense Disambiguation in Thai text. Given a word and its context, this function returns the most suitable sense or meaning for that word. This is particularly useful for tasks where word sense ambiguity needs to be resolved, such as text understanding and translation.

+
+ +

By using the pythainlp.wsd module, you can enhance the accuracy of your NLP applications when dealing with Thai text, ensuring that words are interpreted in the correct context.

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/genindex.html b/5.1/genindex.html new file mode 100644 index 0000000..d49570b --- /dev/null +++ b/5.1/genindex.html @@ -0,0 +1,1025 @@ + + + + + + + + Index — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ + +

Index

+ +
+ _ + | A + | B + | C + | D + | E + | F + | G + | H + | I + | K + | L + | M + | N + | P + | R + | S + | T + | U + | W + | Z + +
+

_

+ + + +
+ +

A

+ + +
+ +

B

+ + + +
+ +

C

+ + + +
+ +

D

+ + + +
+ +

E

+ + + +
+ +

F

+ + + +
+ +

G

+ + + +
+ +

H

+ + +
+ +

I

+ + + +
+ +

K

+ + + +
+ +

L

+ + + +
+ +

M

+ + + +
+ +

N

+ + + +
+ +

P

+ + + +
    +
  • + pythainlp.tokenize.longest + +
  • +
  • + pythainlp.tokenize.multi_cut + +
  • +
  • + pythainlp.tokenize.nercut + +
  • +
  • + pythainlp.tokenize.newmm + +
  • +
  • + pythainlp.tokenize.nlpo3 + +
  • +
  • + pythainlp.tokenize.oskut + +
  • +
  • + pythainlp.tokenize.pyicu + +
  • +
  • + pythainlp.tokenize.sefr_cut + +
  • +
  • + pythainlp.tokenize.tcc + +
  • +
  • + pythainlp.tokenize.tcc_p + +
  • +
  • + pythainlp.tokenize.thaisumcut + +
  • +
  • PYTHAINLP_DATA_DIR +
  • +
+ +

R

+ + + +
+ +

S

+ + + +
+ +

T

+ + + +
+ +

U

+ + + +
+ +

W

+ + + +
+ +

Z

+ + +
+ + + +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/index.html b/5.1/index.html new file mode 100644 index 0000000..6a31ba4 --- /dev/null +++ b/5.1/index.html @@ -0,0 +1,212 @@ + + + + + + + + + PyThaiNLP documentation — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

PyThaiNLP documentation

+
+_images/logo.png + +
+

PyThaiNLP is a Python library for Thai natural language processing (NLP).

+

Website: PyThaiNLP.github.io

+ + +
+
+

Indices and tables

+ +
+
+

Citations

+

If you use PyThaiNLP in your project or publication, please cite the library as follows

+
+

Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Suriyawongkul, Lalita Lowphansirikul, Pattarawat Chormai, Peerat Limkonchotiwat, Thanathip Suntorntip, and Can Udomcharoenchaikit. 2023. PyThaiNLP: Thai Natural Language Processing in Python. In Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023), pages 25–36, Singapore, Singapore. Empirical Methods in Natural Language Processing.

+
+

Apache Software License 2.0

+

Maintained by the PyThaiNLP team.

+
+ + +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/notes/FAQ.html b/5.1/notes/FAQ.html new file mode 100644 index 0000000..7b01398 --- /dev/null +++ b/5.1/notes/FAQ.html @@ -0,0 +1,148 @@ + + + + + + + + + FAQ — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/5.1/notes/command_line.html b/5.1/notes/command_line.html new file mode 100644 index 0000000..be558fe --- /dev/null +++ b/5.1/notes/command_line.html @@ -0,0 +1,251 @@ + + + + + + + + + Command Line — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Command Line

+

You can use some thainlp functions directly from the command line.

+

Tokenization:

+
thainlp tokenize <word|syllable|subword|sent> [-w] [-nw] [-a newmm|attacut|longest] [-s SEPARATOR] TEXT
+
+
+

Example:

+
$ thainlp tokenize word สภาพการจ้างและสภาพการทำงาน
+สภาพการจ้าง|และ|สภาพ|การทำงาน|
+
+$ thainlp tokenize syllable สภาพการจ้างและสภาพการทำงาน
+สภาพ~การ~จ้าง~และ~สภาพ~การ~ทำ~งาน~
+
+$ thainlp tokenize subword สภาพการจ้างและสภาพการทำงาน
+ส/ภา/พ/กา/ร/จ้า/ง/และ/ส/ภา/พ/กา/ร/ทำ/งา/น/
+
+$ thainlp tokenize word -a longest "แรงงานกะดึก: ฟันเฟืองที่ยังหมุนในคำ่คืนมีเคอร์ฟิว"
+แรงงาน|กะ|ดึก|:| |ฟันเฟือง|ที่|ยัง|หมุน|ใน|คำ่|คืน|มี|เคอร์ฟิว|
+
+$ thainlp tokenize word -nw -s "##" "5 เหตุผล 'ไม่ควร' ต่อพ.ร.ก.ฉุกเฉิน"
+5##เหตุผล##'##ไม่##ควร##'##ต่อ##พ.ร.ก.##ฉุกเฉิน##
+
+$ thainlp tokenize sent "หลายปีที่ผ่านมา ชาวชุมชนโคกยาวหลายคนได้พากันย้ายออก บ้างก็เสียชีวิต บางคนถูกจำคุกในข้อบุกรุกป่าหรือแม้กระทั่งสูญหาย"
+หลายปีที่ผ่านมา @@ชาวชุมชนโคกยาวหลายคนได้พากันย้ายออก @@บ้างก็เสียชีวิต @@บางคนถูกจำคุกในข้อบุกรุกป่าหรือแม้กระทั่งสูญหาย@@
+
+
+

Part-Of-Speech tagging:

+
thainlp tag pos [-s SEPARATOR] TEXT
+
+
+

Example:

+
$ thainlp tag pos -s . ผม.ไม่.กิน.เผ็ด
+
+
+

Soundex:

+
thainlp soundex [-a udom83|lk82|metasound] TEXT
+
+
+

Example:

+
$ thainlp soundex วรรณ
+ว330000
+
+$ thainlp soundex -a lk82 วัน
+ว4000
+
+$ thainlp soundex -a lk82 วรรณ
+ว4000
+
+
+

Corpus management:

+
thainlp data <catalog|info|get|rm|path>
+
+
+

Example:

+
$ thainlp data path
+/Users/user1/pythainlp-data
+
+$ thainlp data catalog
+Dataset/corpus available for download:
+- crfcut 0.1
+- thai-g2p 0.1  (Local: 0.1)
+- thai2fit_wv 0.1
+- thainer-1-3 1.3
+
+$ thainlp data get thai2fit_wv
+Corpus: thai2fit_wv
+- Downloading: thai2fit_wv 0.1
+36%|█████████████████▉                                |
+
+$ thainlp data --help
+
+
+

Benchmark:

+
thainlp  benchmark word-tokenization --input-file <source> --test-file <label> [--save-details]
+
+
+

Example:

+
$ thainlp benchmark word-tokenization --input-file wisesight-1000-deepcut.txt --test-file wisesight-1000.label
+Benchmarking wisesight-1000-deepcut.txt against .wisesight-1000.label with 993 samples in total
+============== Benchmark Result ==============
+                       char_level:tp 17654.0000
+                       char_level:fn 1153.0000
+                       char_level:tn 50755.0000
+                       char_level:fp 1478.0000
+                char_level:precision 0.9227
+                   char_level:recall 0.9387
+    word_level:total_words_in_sample 19132.0000
+word_level:total_words_in_ref_sample 18807.0000
+word_level:correctly_tokenised_words 15637.0000
+                word_level:precision 0.8173
+                   word_level:recall 0.8314
+
+
+

Misspell:

+
thainlp misspell --file <input_file> [--seed <seed>] [--misspell-ratio <ratio>] [--output <output_file>]
+
+
+

Example:

+
$ thainlp misspell --file ./some/data.txt --seed=1 --misspell-ratio 0.05
+# output file: ./some/data-misspelled-r0.05-seed1.txt
+
+
+

Help:

+
thainlp --help
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/notes/getting_started.html b/5.1/notes/getting_started.html new file mode 100644 index 0000000..37f7419 --- /dev/null +++ b/5.1/notes/getting_started.html @@ -0,0 +1,167 @@ + + + + + + + + + Getting Started — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Getting Started

+

PyThaiNLP is a Python library for natural language processing (NLP) of the Thai language. With this package, you can perform NLP tasks such as text classification and text tokenization.

+

Tokenization Example:

+
from pythainlp.tokenize import word_tokenize
+
+text = "โอเคบ่เรารักภาษาถิ่น"
+word_tokenize(text, engine="newmm")  # ['โอเค', 'บ่', 'เรา', 'รัก', 'ภาษาถิ่น']
+word_tokenize(text, engine="icu")  # ['โอ', 'เค', 'บ่', 'เรา', 'รัก', 'ภาษา', 'ถิ่น']
+
+
+

Thai has historically faced a lot of NLP challenges. A quick list of them includes the following:

+
    +
  1. Start-end of sentence marking - This is arguably the biggest problem for the field of Thai NLP. The lack of end of sentence marking (EOS) makes it hard for researchers to create training sets, the basis of most research in this field. The root of the problem is two-pronged. In terms of writing system, Thai uses space to indicate both commas and periods. No letter indicates an end of a sentence. In terms of language use, Thais have a habit of starting their sentences with connector terms such as ‘because’, ‘but’, ‘following’, etc, making it often hard even for natives to decide where the end of sentence should be when translating.

  2. +
  3. Word segmentation - Thai does not use spaces, and word segmentation is not easy. It boils down to understanding the context and ruling out words that do not make sense. This is a similar issue that other Asian languages such as Japanese and Chinese face in different degrees. For languages with spaces, a similar but less extreme problem would be multi-word expressions, like the French word for potato, ‘pomme de terre’. In Thai, the best known example is “ตา-กลม” and “ตาก-ลม”. Recently, new techniques that capture words, subwords, and letters in vectors seem poised to overcome the issue (see the segmentation sketch after this list).

  4. +
+
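A minimal sketch of the segmentation ambiguity described above; the chosen segmentation depends on the engine and its dictionary, so no single output is claimed here.

from pythainlp.tokenize import word_tokenize
+
+# "ตากลม" can be read as "ตา|กลม" (round eyes) or "ตาก|ลม" (drying in the wind);
+# the tokenizer has to pick one reading from dictionary matching alone.
+print(word_tokenize("ตากลม", engine="newmm"))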
+
+

Tutorial Notebooks

+ +
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/notes/installation.html b/5.1/notes/installation.html new file mode 100644 index 0000000..219ccdf --- /dev/null +++ b/5.1/notes/installation.html @@ -0,0 +1,247 @@ + + + + + + + + + Installation — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Installation

+

For stable version:

+
pip install pythainlp
+
+
+

For development version:

+
pip install --upgrade --pre pythainlp
+
+
+

For some functionalities, like named entity recognition, extra packages may be needed. Install them with these install options:

+
+

pip install pythainlp[extra1,extra2,…]

+
+
+
where extras can be
    +
  • attacut (to support attacut, a fast and accurate tokenizer)

  • +
  • benchmarks (to support benchmarks)

  • +
  • icu (for ICU, International Components for Unicode, support in transliteration and tokenization)

  • +
  • ipa (for IPA, International Phonetic Alphabet, support in transliteration)

  • +
  • ml (to support ULMFiT models for classification)

  • +
  • ssg (to support ssg for syllable tokenizer)

  • +
  • thai2fit (for Thai word vector)

  • +
  • thai2rom (for machine-learnt romanization)

  • +
  • translate (to support translation)

  • +
  • wangchanberta (to support wangchanberta models)

  • +
  • mt5 (to support mt5 models for Thai text summarization)

  • +
  • wordnet (to support wordnet)

  • +
  • spell (to support phunspell & symspellpy)

  • +
  • generate (to support text generation with ulmfit or thai2fit)

  • +
  • textaugment (to support text augmentation)

  • +
  • oskut (to support OSKUT)

  • +
  • nlpo3 (to support nlpo3 engine)

  • +
  • spacy_thai (to support spacy_thai engine)

  • +
  • esupar (to support esupar engine)

  • +
  • transformers_ud (to support transformers_ud engine)

  • +
  • dependency_parsing (to support dependency parsing with all engines)

  • +
  • coreference_resolution (to support coreference resolution with all engines)

  • +
  • wangchanglm (to support wangchanglm model)

  • +
  • wsd (to support pythainlp.wsd)

  • +
  • el (to support pythainlp.el)

  • +
  • abbreviation (to support pythainlp.util.abbreviation_to_full_text)

  • +
  • full (install everything)

  • +
+
+
+

For dependency details, look at extras variable in setup.py.

+

Note for installation on Windows:

+
    +
  • PyICU libraries may be required. You have two options for getting them installed on Windows.

  • +
  • +
    Option 1 (recommended):
      +
    • Find a pre-built package (“wheel”) from https://www.lfd.uci.edu/~gohlke/pythonlibs/

    • +
    • Download a suitable wheel for your Python version (3.5, 3.6, etc.) and CPU architecture (“win32” for 32-bit Windows and “amd64” for 64-bit Windows)

    • +
    • Install them with pip. For example: pip install PyICU-xxx‑cp36‑cp36m‑win32.whl

    • +
    +
    +
    +
  • +
  • +
    Option 2 (advanced):
      +
    • You can also try to install them with a command: pip install pyicu

    • +
    • With this, pip will try to build the libraries directly from source files.

    • +
    • This will take some time and needs a set of build tools to be installed on your system, for example the Microsoft Visual C++ Compiler. It also requires some technical knowledge of how things get built on a Windows system, as you may need to configure some environment variables to accommodate the build process.

    • +
    • For PyICU, before the installation, you have to set the ICU_VERSION environment variable to the ICU version in your system. For example, set ICU_VERSION=62.1.

    • +
    • This approach obviously takes more time and effort, but the upside is that the library will be optimized for your system, which could mean better performance.

    • +
    +
    +
    +
  • +
+
+

Runtime Configurations

+
+
+PYTHAINLP_DATA_DIR
+

This environment variable specifies the location where the downloaded data +and the corpus database information are stored. If this directory +does not exist, PyThaiNLP will automatically create a new one.

+

By default, it is specified to the directory called pythainlp-data +within the home directory.

+

Type thainlp data path at command line to see current PYTHAINLP_DATA_DIR.

+
+ +
+
+PYTHAINLP_READ_MODE
+

This environment variable configures PyThaiNLP to run in read-only mode (0 = False, 1 = True).
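A minimal sketch of setting both runtime configuration variables from Python; the path is illustrative, and the variables should be set before pythainlp is imported so that they take effect.

import os
+
+os.environ["PYTHAINLP_DATA_DIR"] = "/path/to/pythainlp-data"  # illustrative path
+os.environ["PYTHAINLP_READ_MODE"] = "1"                       # 1 = read-only, 0 = normal
+
+import pythainlp  # imported after the variables are set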

+
+ +
+
+
+

FAQ

+

Q: How do I install PyThaiNLP on Python 3.10+? +A: For Python 3.10+, PyThaiNLP has a python-crfsuite problem; see the issue python-crfsuite dependency fails to build under python 3.10.

+

Q: How do I set the environment variables of each executor node in a distributed environment? +A: See the issue PermissionError: [Errno 13] Permission denied: /home/pythainlp-data.

+

Q: How do I set PyThaiNLP to read-only mode? +A: You can configure PYTHAINLP_READ_MODE.

+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/notes/license.html b/5.1/notes/license.html new file mode 100644 index 0000000..de8ab0f --- /dev/null +++ b/5.1/notes/license.html @@ -0,0 +1,168 @@ + + + + + + + + + License — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

License

+ +

PyThaiNLP License

+
+

Copyright 2016 - 2025 PyThaiNLP

+

Licensed under the Apache License, Version 2.0 (the “License”); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at

+
+
+

Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an “AS IS” BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/objects.inv b/5.1/objects.inv new file mode 100644 index 0000000..f49684f Binary files /dev/null and b/5.1/objects.inv differ diff --git a/5.1/py-modindex.html b/5.1/py-modindex.html new file mode 100644 index 0000000..6552ad0 --- /dev/null +++ b/5.1/py-modindex.html @@ -0,0 +1,237 @@ + + + + + + + + Python Module Index — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ + +

Python Module Index

+ +
+ p +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 
+ p
+ pythainlp +
    + pythainlp.summarize.keybert +
    + pythainlp.tokenize.attacut +
    + pythainlp.tokenize.crfcut +
    + pythainlp.tokenize.etcc +
    + pythainlp.tokenize.han_solo +
    + pythainlp.tokenize.longest +
    + pythainlp.tokenize.multi_cut +
    + pythainlp.tokenize.nercut +
    + pythainlp.tokenize.newmm +
    + pythainlp.tokenize.nlpo3 +
    + pythainlp.tokenize.oskut +
    + pythainlp.tokenize.pyicu +
    + pythainlp.tokenize.sefr_cut +
    + pythainlp.tokenize.tcc +
    + pythainlp.tokenize.tcc_p +
    + pythainlp.tokenize.thaisumcut +
+ + +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/5.1/search.html b/5.1/search.html new file mode 100644 index 0000000..284d3d7 --- /dev/null +++ b/5.1/search.html @@ -0,0 +1,157 @@ + + + + + + + + Search — PyThaiNLP v5.1.0 documentation + + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ + + + +
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2017-2025, PyThaiNLP (Apache Software License 2.0).

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/5.1/searchindex.js b/5.1/searchindex.js new file mode 100644 index 0000000..c4b10a9 --- /dev/null +++ b/5.1/searchindex.js @@ -0,0 +1 @@ +Search.setIndex({"alltitles": {"Additional Functions": [[0, "additional-functions"], [4, "additional-functions"]], "BPEmbAug Class": [[0, "bpembaug-class"], [4, "bpembaug-class"]], "Bigram": [[11, "bigram"]], "Citations": [[30, "citations"]], "Command Line": [[32, null]], "ConceptNet": [[9, "conceptnet"]], "Coreference Resolution Function": [[8, "coreference-resolution-function"]], "DEFAULT_SPELL_CHECKER": [[18, "default-spell-checker"]], "Definition": [[9, "definition"]], "Dependencies": [[2, "dependencies"], [28, "dependencies"]], "EntityLinker": [[10, "entitylinker"]], "Example": [[10, "example"], [11, "example"], [12, "example"], [15, "example"]], "FAQ": [[31, null], [34, "faq"]], "FastTextAug and Thai2transformersAug Classes": [[0, "fasttextaug-and-thai2transformersaug-classes"], [4, "fasttextaug-and-thai2transformersaug-classes"]], "Functions": [[5, "functions"]], "Getting Started": [[33, null]], "Indices and tables": [[30, "indices-and-tables"]], "Installation": [[34, null]], "Introduction": [[0, "introduction"], [4, "introduction"], [5, "introduction"], [8, "introduction"]], "KeyBERT": [[19, "module-pythainlp.summarize.keybert"]], "Keyword Extraction Engines": [[19, "keyword-extraction-engines"]], "KhaveeVerifier": [[12, "khaveeverifier"]], "License": [[35, null]], "Modules": [[1, "modules"], [2, "modules"], [3, "modules"], [9, "modules"], [11, "modules"], [12, "modules"], [13, "modules"], [15, "modules"], [16, "modules"], [17, "modules"], [18, "modules"], [19, "modules"], [20, "modules"], [21, "modules"], [22, "modules"], [23, "modules"], [24, "modules"], [25, "modules"], [26, "modules"], [27, "modules"], [28, "modules"], [29, "modules"]], "NorvigSpellChecker": [[18, "norvigspellchecker"]], "Notes": [[30, null]], "OSCAR": [[9, "oscar"]], "Package reference:": [[30, null]], "PyThaiNLP documentation": [[30, null]], "Quality Evaluation": [[5, "quality-evaluation"]], "References": [[1, "references"], [2, "references"], [17, "references"], [18, "references"], [20, "references"], [24, "references"], [26, "references"], [27, "references"], [28, "references"]], "Runtime Configurations": [[34, "runtime-configurations"]], "Sentence level": [[21, "sentence-level"]], "Subword level": [[21, "subword-level"]], "Synset": [[9, "synset"]], "Tagger Engines": [[20, "tagger-engines"]], "Tokenization": [[5, "tokenization"]], "Tokenization Engines": [[21, "tokenization-engines"]], "Transliteration Engines": [[1, "transliteration-engines"], [24, "transliteration-engines"]], "Trigram": [[11, "trigram"]], "Tutorial Notebooks": [[33, "tutorial-notebooks"]], "Unigram": [[11, "unigram"]], "Usage": [[5, "usage"], [8, "usage"], [11, "usage"], [15, "usage"]], "Util": [[9, "util"]], "Word level": [[21, "word-level"]], "Word2VecAug, Thai2fitAug, LTW2VAug Classes": [[0, "word2vecaug-thai2fitaug-ltw2vaug-classes"], [4, "word2vecaug-thai2fitaug-ltw2vaug-classes"]], "WordNet": [[9, "wordnet"]], "WordNetAug Class": [[0, "wordnetaug-class"], [4, "wordnetaug-class"]], "correct": [[18, "correct"]], "correct_sent": [[18, "correct-sent"]], "countries": [[9, "countries"]], "dependency_parsing": [[15, "dependency-parsing"]], "download": [[9, "download"]], "find_synonym": [[9, "find-synonym"]], "get_corpus": [[9, "get-corpus"]], "get_corpus_as_is": [[9, "get-corpus-as-is"]], "get_corpus_db": [[9, "get-corpus-db"]], 
"get_corpus_db_detail": [[9, "get-corpus-db-detail"]], "get_corpus_default_db": [[9, "get-corpus-default-db"]], "get_corpus_path": [[9, "get-corpus-path"]], "lk82": [[17, "lk82"]], "metasound": [[17, "metasound"]], "perceptron": [[20, "perceptron"]], "prayut_and_somchaip": [[17, "prayut-and-somchaip"]], "provinces": [[9, "provinces"]], "pythainlp.ancient": [[3, null]], "pythainlp.augment": [[0, null], [4, null]], "pythainlp.benchmarks": [[5, null]], "pythainlp.chat": [[6, null]], "pythainlp.classify": [[7, null]], "pythainlp.coref": [[8, null]], "pythainlp.corpus": [[9, null]], "pythainlp.corpus.conceptnet.edges": [[9, "pythainlp-corpus-conceptnet-edges"]], "pythainlp.corpus.oscar.unigram_word_freqs": [[9, "pythainlp-corpus-oscar-unigram-word-freqs"]], "pythainlp.corpus.oscar.word_freqs": [[9, "pythainlp-corpus-oscar-word-freqs"]], "pythainlp.corpus.th_en_translit.get_transliteration_dict": [[9, "pythainlp-corpus-th-en-translit-get-transliteration-dict"]], "pythainlp.corpus.tnc.bigram_word_freqs": [[9, "pythainlp-corpus-tnc-bigram-word-freqs"]], "pythainlp.corpus.tnc.trigram_word_freqs": [[9, "pythainlp-corpus-tnc-trigram-word-freqs"]], "pythainlp.corpus.tnc.unigram_word_freqs": [[9, "pythainlp-corpus-tnc-unigram-word-freqs"]], "pythainlp.corpus.tnc.word_freqs": [[9, "pythainlp-corpus-tnc-word-freqs"]], "pythainlp.corpus.ttc.unigram_word_freqs": [[9, "pythainlp-corpus-ttc-unigram-word-freqs"]], "pythainlp.corpus.ttc.word_freqs": [[9, "pythainlp-corpus-ttc-word-freqs"]], "pythainlp.corpus.util.find_badwords": [[9, "pythainlp-corpus-util-find-badwords"]], "pythainlp.corpus.util.revise_newmm_default_wordset": [[9, "pythainlp-corpus-util-revise-newmm-default-wordset"]], "pythainlp.corpus.util.revise_wordset": [[9, "pythainlp-corpus-util-revise-wordset"]], "pythainlp.corpus.wordnet.all_lemma_names": [[9, "pythainlp-corpus-wordnet-all-lemma-names"]], "pythainlp.corpus.wordnet.all_synsets": [[9, "pythainlp-corpus-wordnet-all-synsets"]], "pythainlp.corpus.wordnet.custom_lemmas": [[9, "pythainlp-corpus-wordnet-custom-lemmas"]], "pythainlp.corpus.wordnet.langs": [[9, "pythainlp-corpus-wordnet-langs"]], "pythainlp.corpus.wordnet.lch_similarity": [[9, "pythainlp-corpus-wordnet-lch-similarity"]], "pythainlp.corpus.wordnet.lemma": [[9, "pythainlp-corpus-wordnet-lemma"]], "pythainlp.corpus.wordnet.lemma_from_key": [[9, "pythainlp-corpus-wordnet-lemma-from-key"]], "pythainlp.corpus.wordnet.lemmas": [[9, "pythainlp-corpus-wordnet-lemmas"]], "pythainlp.corpus.wordnet.morphy": [[9, "pythainlp-corpus-wordnet-morphy"]], "pythainlp.corpus.wordnet.path_similarity": [[9, "pythainlp-corpus-wordnet-path-similarity"]], "pythainlp.corpus.wordnet.synset": [[9, "pythainlp-corpus-wordnet-synset"]], "pythainlp.corpus.wordnet.synsets": [[9, "pythainlp-corpus-wordnet-synsets"]], "pythainlp.corpus.wordnet.wup_similarity": [[9, "pythainlp-corpus-wordnet-wup-similarity"]], "pythainlp.el": [[10, null]], "pythainlp.generate": [[11, null]], "pythainlp.generate.thai2fit.gen_sentence": [[11, "pythainlp-generate-thai2fit-gen-sentence"]], "pythainlp.generate.wangchanglm.WangChanGLM": [[11, "pythainlp-generate-wangchanglm-wangchanglm"]], "pythainlp.khavee": [[12, null]], "pythainlp.lm": [[13, null]], "pythainlp.morpheme": [[14, null]], "pythainlp.parse": [[15, null]], "pythainlp.phayathaibert": [[16, null]], "pythainlp.soundex": [[17, null]], "pythainlp.soundex.sound.audio_vector": [[17, "pythainlp-soundex-sound-audio-vector"]], "pythainlp.soundex.sound.word2audio": [[17, "pythainlp-soundex-sound-word2audio"]], 
"pythainlp.soundex.sound.word_approximation": [[17, "pythainlp-soundex-sound-word-approximation"]], "pythainlp.spell": [[18, null]], "pythainlp.summarize": [[19, null]], "pythainlp.tag": [[20, null]], "pythainlp.tokenize": [[21, null]], "pythainlp.tools": [[22, null]], "pythainlp.translate": [[23, null]], "pythainlp.transliterate": [[1, null], [24, null]], "pythainlp.ulmfit": [[25, null]], "pythainlp.util": [[26, null]], "pythainlp.wangchanberta": [[27, null]], "pythainlp.word_vector": [[2, null], [28, null]], "pythainlp.wsd": [[29, null]], "remove": [[9, "remove"]], "soundex": [[17, "soundex"]], "spell": [[18, "spell"]], "spell_sent": [[18, "spell-sent"]], "thai_dict": [[9, "thai-dict"]], "thai_family_names": [[9, "thai-family-names"]], "thai_female_names": [[9, "thai-female-names"]], "thai_male_names": [[9, "thai-male-names"]], "thai_negations": [[9, "thai-negations"]], "thai_orst_words": [[9, "thai-orst-words"]], "thai_stopwords": [[9, "thai-stopwords"]], "thai_syllables": [[9, "thai-syllables"]], "thai_synonyms": [[9, "thai-synonyms"]], "thai_words": [[9, "thai-words"]], "thai_wsd_dict": [[9, "thai-wsd-dict"]], "udom83": [[17, "udom83"]], "unigram": [[20, "unigram"]]}, "docnames": ["api/.ipynb_checkpoints/augment-checkpoint", "api/.ipynb_checkpoints/transliterate-checkpoint", "api/.ipynb_checkpoints/word_vector-checkpoint", "api/ancient", "api/augment", "api/benchmarks", "api/chat", "api/classify", "api/coref", "api/corpus", "api/el", "api/generate", "api/khavee", "api/lm", "api/morpheme", "api/parse", "api/phayathaibert", "api/soundex", "api/spell", "api/summarize", "api/tag", "api/tokenize", "api/tools", "api/translate", "api/transliterate", "api/ulmfit", "api/util", "api/wangchanberta", "api/word_vector", "api/wsd", "index", "notes/FAQ", "notes/command_line", "notes/getting_started", "notes/installation", "notes/license"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx.ext.viewcode": 1}, "filenames": ["api/.ipynb_checkpoints/augment-checkpoint.rst", "api/.ipynb_checkpoints/transliterate-checkpoint.rst", "api/.ipynb_checkpoints/word_vector-checkpoint.rst", "api/ancient.rst", "api/augment.rst", "api/benchmarks.rst", "api/chat.rst", "api/classify.rst", "api/coref.rst", "api/corpus.rst", "api/el.rst", "api/generate.rst", "api/khavee.rst", "api/lm.rst", "api/morpheme.rst", "api/parse.rst", "api/phayathaibert.rst", "api/soundex.rst", "api/spell.rst", "api/summarize.rst", "api/tag.rst", "api/tokenize.rst", "api/tools.rst", "api/translate.rst", "api/transliterate.rst", "api/ulmfit.rst", "api/util.rst", "api/wangchanberta.rst", "api/word_vector.rst", "api/wsd.rst", "index.rst", "notes/FAQ.rst", "notes/command_line.rst", "notes/getting_started.rst", "notes/installation.rst", "notes/license.rst"], "indexentries": {"__init__() (pythainlp.generate.bigram method)": [[11, "pythainlp.generate.Bigram.__init__", false]], "__init__() (pythainlp.generate.trigram method)": [[11, "pythainlp.generate.Trigram.__init__", false]], "__init__() (pythainlp.generate.unigram method)": [[11, "pythainlp.generate.Unigram.__init__", false]], "__init__() (pythainlp.generate.wangchanglm.wangchanglm method)": [[11, "pythainlp.generate.wangchanglm.WangChanGLM.__init__", false]], "__init__() (pythainlp.tag.ner 
method)": [[20, "pythainlp.tag.NER.__init__", false]], "__init__() (pythainlp.tag.nner method)": [[20, "pythainlp.tag.NNER.__init__", false]], "__init__() (pythainlp.tag.thainer.thainametagger method)": [[20, "pythainlp.tag.thainer.ThaiNameTagger.__init__", false]], "__init__() (pythainlp.tokenize.attacut.attacuttokenizer method)": [[21, "pythainlp.tokenize.attacut.AttacutTokenizer.__init__", false]], "__init__() (pythainlp.tokenize.han_solo.featurizer method)": [[21, "pythainlp.tokenize.han_solo.Featurizer.__init__", false]], "__init__() (pythainlp.tokenize.longest.longestmatchtokenizer method)": [[21, "pythainlp.tokenize.longest.LongestMatchTokenizer.__init__", false]], "__init__() (pythainlp.tokenize.multi_cut.latticestring method)": [[21, "pythainlp.tokenize.multi_cut.LatticeString.__init__", false]], "__init__() (pythainlp.tokenize.tokenizer method)": [[21, "pythainlp.tokenize.Tokenizer.__init__", false]], "__init__() (pythainlp.transliterate.wunsen.wunsentransliterate method)": [[1, "pythainlp.transliterate.wunsen.WunsenTransliterate.__init__", false], [24, "pythainlp.transliterate.wunsen.WunsenTransliterate.__init__", false]], "attacuttokenizer (class in pythainlp.tokenize.attacut)": [[21, "pythainlp.tokenize.attacut.AttacutTokenizer", false]], "bigram (class in pythainlp.generate)": [[11, "pythainlp.generate.Bigram", false]], "chunk_parse() (in module pythainlp.tag)": [[20, "pythainlp.tag.chunk_parse", false]], "display_cell_tokenize (class in pythainlp.tokenize)": [[21, "pythainlp.tokenize.display_cell_tokenize", false]], "extract_features() (in module pythainlp.tokenize.crfcut)": [[21, "pythainlp.tokenize.crfcut.extract_features", false]], "featurize() (pythainlp.tokenize.han_solo.featurizer method)": [[21, "pythainlp.tokenize.han_solo.Featurizer.featurize", false]], "featurizer (class in pythainlp.tokenize.han_solo)": [[21, "pythainlp.tokenize.han_solo.Featurizer", false]], "find_all_segment() (in module pythainlp.tokenize.multi_cut)": [[21, "pythainlp.tokenize.multi_cut.find_all_segment", false]], "gen_instruct() (pythainlp.generate.wangchanglm.wangchanglm method)": [[11, "pythainlp.generate.wangchanglm.WangChanGLM.gen_instruct", false]], "gen_sentence() (pythainlp.generate.bigram method)": [[11, "pythainlp.generate.Bigram.gen_sentence", false]], "gen_sentence() (pythainlp.generate.trigram method)": [[11, "pythainlp.generate.Trigram.gen_sentence", false]], "gen_sentence() (pythainlp.generate.unigram method)": [[11, "pythainlp.generate.Unigram.gen_sentence", false]], "get_ner() (pythainlp.tag.thainer.thainametagger method)": [[20, "pythainlp.tag.thainer.ThaiNameTagger.get_ner", false]], "instruct_generate() (pythainlp.generate.wangchanglm.wangchanglm method)": [[11, "pythainlp.generate.wangchanglm.WangChanGLM.instruct_generate", false]], "is_exclude() (pythainlp.generate.wangchanglm.wangchanglm method)": [[11, "pythainlp.generate.wangchanglm.WangChanGLM.is_exclude", false]], "latticestring (class in pythainlp.tokenize.multi_cut)": [[21, "pythainlp.tokenize.multi_cut.LatticeString", false]], "list_to_string() (in module pythainlp.tokenize.thaisumcut)": [[21, "pythainlp.tokenize.thaisumcut.list_to_string", false]], "load_dict() (in module pythainlp.tokenize.nlpo3)": [[21, "pythainlp.tokenize.nlpo3.load_dict", false]], "load_engine() (pythainlp.tag.ner method)": [[20, "pythainlp.tag.NER.load_engine", false]], "load_engine() (pythainlp.tag.nner method)": [[20, "pythainlp.tag.NNER.load_engine", false]], "load_model() (pythainlp.generate.wangchanglm.wangchanglm method)": [[11, 
"pythainlp.generate.wangchanglm.WangChanGLM.load_model", false]], "longestmatchtokenizer (class in pythainlp.tokenize.longest)": [[21, "pythainlp.tokenize.longest.LongestMatchTokenizer", false]], "middle_cut() (in module pythainlp.tokenize.thaisumcut)": [[21, "pythainlp.tokenize.thaisumcut.middle_cut", false]], "mmcut() (in module pythainlp.tokenize.multi_cut)": [[21, "pythainlp.tokenize.multi_cut.mmcut", false]], "module": [[21, "module-pythainlp.tokenize.attacut", false], [21, "module-pythainlp.tokenize.crfcut", false], [21, "module-pythainlp.tokenize.etcc", false], [21, "module-pythainlp.tokenize.han_solo", false], [21, "module-pythainlp.tokenize.longest", false], [21, "module-pythainlp.tokenize.multi_cut", false], [21, "module-pythainlp.tokenize.nercut", false], [21, "module-pythainlp.tokenize.newmm", false], [21, "module-pythainlp.tokenize.nlpo3", false], [21, "module-pythainlp.tokenize.oskut", false], [21, "module-pythainlp.tokenize.pyicu", false], [21, "module-pythainlp.tokenize.sefr_cut", false], [21, "module-pythainlp.tokenize.tcc", false], [21, "module-pythainlp.tokenize.tcc_p", false], [21, "module-pythainlp.tokenize.thaisumcut", false]], "ner (class in pythainlp.tag)": [[20, "pythainlp.tag.NER", false]], "nner (class in pythainlp.tag)": [[20, "pythainlp.tag.NNER", false]], "pad() (pythainlp.tokenize.han_solo.featurizer method)": [[21, "pythainlp.tokenize.han_solo.Featurizer.pad", false]], "pos_tag() (in module pythainlp.tag)": [[20, "pythainlp.tag.pos_tag", false]], "pos_tag_sents() (in module pythainlp.tag)": [[20, "pythainlp.tag.pos_tag_sents", false]], "prob() (pythainlp.generate.bigram method)": [[11, "pythainlp.generate.Bigram.prob", false]], "prob() (pythainlp.generate.trigram method)": [[11, "pythainlp.generate.Trigram.prob", false]], "pythainlp.tokenize.attacut": [[21, "module-pythainlp.tokenize.attacut", false]], "pythainlp.tokenize.crfcut": [[21, "module-pythainlp.tokenize.crfcut", false]], "pythainlp.tokenize.etcc": [[21, "module-pythainlp.tokenize.etcc", false]], "pythainlp.tokenize.han_solo": [[21, "module-pythainlp.tokenize.han_solo", false]], "pythainlp.tokenize.longest": [[21, "module-pythainlp.tokenize.longest", false]], "pythainlp.tokenize.multi_cut": [[21, "module-pythainlp.tokenize.multi_cut", false]], "pythainlp.tokenize.nercut": [[21, "module-pythainlp.tokenize.nercut", false]], "pythainlp.tokenize.newmm": [[21, "module-pythainlp.tokenize.newmm", false]], "pythainlp.tokenize.nlpo3": [[21, "module-pythainlp.tokenize.nlpo3", false]], "pythainlp.tokenize.oskut": [[21, "module-pythainlp.tokenize.oskut", false]], "pythainlp.tokenize.pyicu": [[21, "module-pythainlp.tokenize.pyicu", false]], "pythainlp.tokenize.sefr_cut": [[21, "module-pythainlp.tokenize.sefr_cut", false]], "pythainlp.tokenize.tcc": [[21, "module-pythainlp.tokenize.tcc", false]], "pythainlp.tokenize.tcc_p": [[21, "module-pythainlp.tokenize.tcc_p", false]], "pythainlp.tokenize.thaisumcut": [[21, "module-pythainlp.tokenize.thaisumcut", false]], "romanize() (in module pythainlp.transliterate.royin)": [[1, "pythainlp.transliterate.royin.romanize", false], [24, "pythainlp.transliterate.royin.romanize", false]], "romanize() (in module pythainlp.transliterate.thai2rom)": [[1, "pythainlp.transliterate.thai2rom.romanize", false], [24, "pythainlp.transliterate.thai2rom.romanize", false]], "segment() (in module pythainlp.tokenize.attacut)": [[21, "pythainlp.tokenize.attacut.segment", false]], "segment() (in module pythainlp.tokenize.crfcut)": [[21, "pythainlp.tokenize.crfcut.segment", false]], "segment() 
(in module pythainlp.tokenize.etcc)": [[21, "pythainlp.tokenize.etcc.segment", false]], "segment() (in module pythainlp.tokenize.han_solo)": [[21, "pythainlp.tokenize.han_solo.segment", false]], "segment() (in module pythainlp.tokenize.longest)": [[21, "pythainlp.tokenize.longest.segment", false]], "segment() (in module pythainlp.tokenize.multi_cut)": [[21, "pythainlp.tokenize.multi_cut.segment", false]], "segment() (in module pythainlp.tokenize.nercut)": [[21, "pythainlp.tokenize.nercut.segment", false]], "segment() (in module pythainlp.tokenize.newmm)": [[21, "pythainlp.tokenize.newmm.segment", false]], "segment() (in module pythainlp.tokenize.nlpo3)": [[21, "pythainlp.tokenize.nlpo3.segment", false]], "segment() (in module pythainlp.tokenize.oskut)": [[21, "pythainlp.tokenize.oskut.segment", false]], "segment() (in module pythainlp.tokenize.pyicu)": [[21, "pythainlp.tokenize.pyicu.segment", false]], "segment() (in module pythainlp.tokenize.sefr_cut)": [[21, "pythainlp.tokenize.sefr_cut.segment", false]], "segment() (in module pythainlp.tokenize.tcc)": [[21, "pythainlp.tokenize.tcc.segment", false]], "segment() (in module pythainlp.tokenize.tcc_p)": [[21, "pythainlp.tokenize.tcc_p.segment", false]], "set_tokenize_engine() (pythainlp.tokenize.tokenizer method)": [[21, "pythainlp.tokenize.Tokenizer.set_tokenize_engine", false]], "split_into_sentences() (pythainlp.tokenize.thaisumcut.thaisentencesegmentor method)": [[21, "pythainlp.tokenize.thaisumcut.ThaiSentenceSegmentor.split_into_sentences", false]], "tag() (pythainlp.tag.ner method)": [[20, "pythainlp.tag.NER.tag", false]], "tag() (pythainlp.tag.nner method)": [[20, "pythainlp.tag.NNER.tag", false]], "tag_provinces() (in module pythainlp.tag)": [[20, "pythainlp.tag.tag_provinces", false]], "tcc() (in module pythainlp.tokenize.tcc)": [[21, "pythainlp.tokenize.tcc.tcc", false]], "tcc() (in module pythainlp.tokenize.tcc_p)": [[21, "pythainlp.tokenize.tcc_p.tcc", false]], "tcc_pos() (in module pythainlp.tokenize.tcc)": [[21, "pythainlp.tokenize.tcc.tcc_pos", false]], "tcc_pos() (in module pythainlp.tokenize.tcc_p)": [[21, "pythainlp.tokenize.tcc_p.tcc_pos", false]], "thainametagger (class in pythainlp.tag.thainer)": [[20, "pythainlp.tag.thainer.ThaiNameTagger", false]], "thaisentencesegmentor (class in pythainlp.tokenize.thaisumcut)": [[21, "pythainlp.tokenize.thaisumcut.ThaiSentenceSegmentor", false]], "tokenize() (pythainlp.tokenize.attacut.attacuttokenizer method)": [[21, "pythainlp.tokenize.attacut.AttacutTokenizer.tokenize", false]], "tokenize() (pythainlp.tokenize.longest.longestmatchtokenizer method)": [[21, "pythainlp.tokenize.longest.LongestMatchTokenizer.tokenize", false]], "tokenizer (class in pythainlp.tokenize)": [[21, "pythainlp.tokenize.Tokenizer", false]], "transliterate() (pythainlp.transliterate.wunsen.wunsentransliterate method)": [[1, "pythainlp.transliterate.wunsen.WunsenTransliterate.transliterate", false], [24, "pythainlp.transliterate.wunsen.WunsenTransliterate.transliterate", false]], "trigram (class in pythainlp.generate)": [[11, "pythainlp.generate.Trigram", false]], "unigram (class in pythainlp.generate)": [[11, "pythainlp.generate.Unigram", false]], "wangchanglm (class in pythainlp.generate.wangchanglm)": [[11, "pythainlp.generate.wangchanglm.WangChanGLM", false]], "word_tokenize() (pythainlp.tokenize.tokenizer method)": [[21, "pythainlp.tokenize.Tokenizer.word_tokenize", false]], "wunsentransliterate (class in pythainlp.transliterate.wunsen)": [[1, "pythainlp.transliterate.wunsen.WunsenTransliterate", false], 
[24, "pythainlp.transliterate.wunsen.WunsenTransliterate", false]]}, "objects": {"": [[34, 6, 1, "-", "PYTHAINLP_DATA_DIR"], [34, 6, 1, "-", "PYTHAINLP_READ_MODE"]], "pythainlp.ancient": [[3, 0, 1, "", "aksonhan_to_current"]], "pythainlp.augment": [[4, 1, 1, "", "WordNetAug"]], "pythainlp.augment.WordNetAug": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "augment"], [4, 2, 1, "", "find_synonyms"]], "pythainlp.augment.lm": [[4, 1, 1, "", "FastTextAug"], [4, 1, 1, "", "Thai2transformersAug"]], "pythainlp.augment.lm.FastTextAug": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "augment"], [4, 2, 1, "", "modify_sent"], [4, 2, 1, "", "tokenize"]], "pythainlp.augment.lm.Thai2transformersAug": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "augment"], [4, 2, 1, "", "generate"]], "pythainlp.augment.word2vec": [[4, 1, 1, "", "LTW2VAug"], [4, 1, 1, "", "Thai2fitAug"], [4, 1, 1, "", "Word2VecAug"]], "pythainlp.augment.word2vec.LTW2VAug": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "augment"], [4, 2, 1, "", "load_w2v"], [4, 2, 1, "", "tokenizer"]], "pythainlp.augment.word2vec.Thai2fitAug": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "augment"], [4, 2, 1, "", "load_w2v"], [4, 2, 1, "", "tokenizer"]], "pythainlp.augment.word2vec.Word2VecAug": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "augment"], [4, 2, 1, "", "modify_sent"]], "pythainlp.augment.word2vec.bpemb_wv": [[4, 1, 1, "", "BPEmbAug"]], "pythainlp.augment.word2vec.bpemb_wv.BPEmbAug": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "augment"], [4, 2, 1, "", "load_w2v"], [4, 2, 1, "", "tokenizer"]], "pythainlp.benchmarks.word_tokenization": [[5, 0, 1, "", "benchmark"], [5, 0, 1, "", "compute_stats"], [5, 0, 1, "", "preprocessing"]], "pythainlp.chat": [[6, 1, 1, "", "ChatBotModel"]], "pythainlp.chat.ChatBotModel": [[6, 2, 1, "", "__init__"], [6, 2, 1, "", "chat"], [6, 2, 1, "", "load_model"], [6, 2, 1, "", "reset_chat"]], "pythainlp.classify": [[7, 1, 1, "", "GzipModel"]], "pythainlp.classify.GzipModel": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "load"], [7, 2, 1, "", "predict"], [7, 2, 1, "", "save"], [7, 2, 1, "", "train"]], "pythainlp.coref": [[8, 0, 1, "", "coreference_resolution"]], "pythainlp.el": [[10, 1, 1, "", "EntityLinker"]], "pythainlp.el.EntityLinker": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "get_el"]], "pythainlp.generate": [[11, 1, 1, "", "Bigram"], [11, 1, 1, "", "Trigram"], [11, 1, 1, "", "Unigram"]], "pythainlp.generate.Bigram": [[11, 2, 1, "", "__init__"], [11, 2, 1, "", "gen_sentence"], [11, 2, 1, "", "prob"]], "pythainlp.generate.Trigram": [[11, 2, 1, "", "__init__"], [11, 2, 1, "", "gen_sentence"], [11, 2, 1, "", "prob"]], "pythainlp.generate.Unigram": [[11, 2, 1, "", "__init__"], [11, 2, 1, "", "gen_sentence"]], "pythainlp.generate.wangchanglm": [[11, 1, 1, "", "WangChanGLM"]], "pythainlp.generate.wangchanglm.WangChanGLM": [[11, 2, 1, "", "__init__"], [11, 2, 1, "", "gen_instruct"], [11, 2, 1, "", "instruct_generate"], [11, 2, 1, "", "is_exclude"], [11, 2, 1, "", "load_model"]], "pythainlp.khavee": [[12, 1, 1, "", "KhaveeVerifier"]], "pythainlp.khavee.KhaveeVerifier": [[12, 3, 1, "", "__dict__"], [12, 2, 1, "", "__init__"], [12, 3, 1, "", "__module__"], [12, 2, 1, "", "check_aek_too"], [12, 2, 1, "", "check_karu_lahu"], [12, 2, 1, "", "check_klon"], [12, 2, 1, "", "check_marttra"], [12, 2, 1, "", "check_sara"], [12, 2, 1, "", "handle_karun_sound_silence"], [12, 2, 1, "", "is_sumpus"]], "pythainlp.lm": [[13, 0, 1, "", "calculate_ngram_counts"], [13, 0, 1, "", "remove_repeated_ngrams"]], "pythainlp.morpheme": [[14, 0, 1, "", "nighit"]], 
"pythainlp.parse": [[15, 0, 1, "", "dependency_parsing"]], "pythainlp.phayathaibert": [[16, 1, 1, "", "NamedEntityTagger"], [16, 1, 1, "", "PartOfSpeechTagger"], [16, 1, 1, "", "ThaiTextAugmenter"], [16, 1, 1, "", "ThaiTextProcessor"], [16, 0, 1, "", "segment"]], "pythainlp.phayathaibert.NamedEntityTagger": [[16, 2, 1, "", "__init__"], [16, 2, 1, "", "get_ner"]], "pythainlp.phayathaibert.PartOfSpeechTagger": [[16, 2, 1, "", "__init__"], [16, 2, 1, "", "get_tag"]], "pythainlp.phayathaibert.ThaiTextAugmenter": [[16, 2, 1, "", "__init__"], [16, 2, 1, "", "augment"], [16, 2, 1, "", "generate"]], "pythainlp.phayathaibert.ThaiTextProcessor": [[16, 2, 1, "", "__init__"], [16, 2, 1, "", "preprocess"], [16, 2, 1, "", "remove_space"], [16, 2, 1, "", "replace_newlines"], [16, 2, 1, "", "replace_rep_after"], [16, 2, 1, "", "replace_spaces"], [16, 2, 1, "", "replace_url"], [16, 2, 1, "", "replace_wrep_post"], [16, 2, 1, "", "rm_brackets"], [16, 2, 1, "", "rm_useless_spaces"]], "pythainlp.soundex": [[17, 0, 1, "", "lk82"], [17, 0, 1, "", "metasound"], [17, 0, 1, "", "prayut_and_somchaip"], [17, 0, 1, "", "soundex"], [17, 0, 1, "", "udom83"]], "pythainlp.soundex.sound": [[17, 0, 1, "", "audio_vector"], [17, 0, 1, "", "word2audio"], [17, 0, 1, "", "word_approximation"]], "pythainlp.spell": [[18, 4, 1, "", "DEFAULT_SPELL_CHECKER"], [18, 1, 1, "", "NorvigSpellChecker"], [18, 0, 1, "", "correct"], [18, 0, 1, "", "correct_sent"], [18, 0, 1, "", "spell"], [18, 0, 1, "", "spell_sent"]], "pythainlp.spell.NorvigSpellChecker": [[18, 3, 1, "", "__dict__"], [18, 2, 1, "", "__init__"], [18, 3, 1, "", "__module__"], [18, 2, 1, "", "correct"], [18, 2, 1, "", "dictionary"], [18, 2, 1, "", "freq"], [18, 2, 1, "", "known"], [18, 2, 1, "", "prob"], [18, 2, 1, "", "spell"]], "pythainlp.summarize": [[19, 0, 1, "", "extract_keywords"], [19, 5, 0, "-", "keybert"], [19, 0, 1, "", "summarize"]], "pythainlp.summarize.keybert": [[19, 1, 1, "id0", "KeyBERT"]], "pythainlp.summarize.keybert.KeyBERT": [[19, 2, 1, "id5", "__init__"], [19, 2, 1, "id9", "embed"], [19, 2, 1, "id6", "extract_keywords"]], "pythainlp.tag": [[20, 1, 1, "", "NER"], [20, 1, 1, "", "NNER"], [20, 0, 1, "", "chunk_parse"], [20, 0, 1, "", "pos_tag"], [20, 0, 1, "", "pos_tag_sents"], [20, 0, 1, "", "tag_provinces"]], "pythainlp.tag.NER": [[20, 2, 1, "", "__init__"], [20, 2, 1, "", "load_engine"], [20, 2, 1, "", "tag"]], "pythainlp.tag.NNER": [[20, 2, 1, "", "__init__"], [20, 2, 1, "", "load_engine"], [20, 2, 1, "", "tag"]], "pythainlp.tag.thainer": [[20, 1, 1, "", "ThaiNameTagger"]], "pythainlp.tag.thainer.ThaiNameTagger": [[20, 2, 1, "", "__init__"], [20, 2, 1, "", "get_ner"]], "pythainlp.tokenize": [[21, 1, 1, "", "Tokenizer"], [21, 5, 0, "-", "attacut"], [21, 5, 0, "-", "crfcut"], [21, 1, 1, "", "display_cell_tokenize"], [21, 5, 0, "-", "etcc"], [21, 5, 0, "-", "han_solo"], [21, 5, 0, "-", "longest"], [21, 5, 0, "-", "multi_cut"], [21, 5, 0, "-", "nercut"], [21, 5, 0, "-", "newmm"], [21, 5, 0, "-", "nlpo3"], [21, 5, 0, "-", "oskut"], [21, 5, 0, "-", "pyicu"], [21, 5, 0, "-", "sefr_cut"], [21, 5, 0, "-", "tcc"], [21, 5, 0, "-", "tcc_p"], [21, 5, 0, "-", "thaisumcut"]], "pythainlp.tokenize.Tokenizer": [[21, 2, 1, "", "__init__"], [21, 2, 1, "", "set_tokenize_engine"], [21, 2, 1, "", "word_tokenize"]], "pythainlp.tokenize.attacut": [[21, 1, 1, "", "AttacutTokenizer"], [21, 0, 1, "", "segment"]], "pythainlp.tokenize.attacut.AttacutTokenizer": [[21, 2, 1, "", "__init__"], [21, 2, 1, "", "tokenize"]], "pythainlp.tokenize.crfcut": [[21, 0, 1, "", "extract_features"], 
[21, 0, 1, "", "segment"]], "pythainlp.tokenize.etcc": [[21, 0, 1, "", "segment"]], "pythainlp.tokenize.han_solo": [[21, 1, 1, "", "Featurizer"], [21, 0, 1, "", "segment"]], "pythainlp.tokenize.han_solo.Featurizer": [[21, 2, 1, "", "__init__"], [21, 2, 1, "", "featurize"], [21, 2, 1, "", "pad"]], "pythainlp.tokenize.longest": [[21, 1, 1, "", "LongestMatchTokenizer"], [21, 0, 1, "", "segment"]], "pythainlp.tokenize.longest.LongestMatchTokenizer": [[21, 2, 1, "", "__init__"], [21, 2, 1, "", "tokenize"]], "pythainlp.tokenize.multi_cut": [[21, 1, 1, "", "LatticeString"], [21, 0, 1, "", "find_all_segment"], [21, 0, 1, "", "mmcut"], [21, 0, 1, "", "segment"]], "pythainlp.tokenize.multi_cut.LatticeString": [[21, 2, 1, "", "__init__"]], "pythainlp.tokenize.nercut": [[21, 0, 1, "", "segment"]], "pythainlp.tokenize.newmm": [[21, 0, 1, "", "segment"]], "pythainlp.tokenize.nlpo3": [[21, 0, 1, "", "load_dict"], [21, 0, 1, "", "segment"]], "pythainlp.tokenize.oskut": [[21, 0, 1, "", "segment"]], "pythainlp.tokenize.pyicu": [[21, 0, 1, "", "segment"]], "pythainlp.tokenize.sefr_cut": [[21, 0, 1, "", "segment"]], "pythainlp.tokenize.tcc": [[21, 0, 1, "", "segment"], [21, 0, 1, "", "tcc"], [21, 0, 1, "", "tcc_pos"]], "pythainlp.tokenize.tcc_p": [[21, 0, 1, "", "segment"], [21, 0, 1, "", "tcc"], [21, 0, 1, "", "tcc_pos"]], "pythainlp.tokenize.thaisumcut": [[21, 1, 1, "", "ThaiSentenceSegmentor"], [21, 0, 1, "", "list_to_string"], [21, 0, 1, "", "middle_cut"]], "pythainlp.tokenize.thaisumcut.ThaiSentenceSegmentor": [[21, 2, 1, "", "split_into_sentences"]], "pythainlp.translate": [[23, 1, 1, "", "Translate"]], "pythainlp.translate.Translate": [[23, 2, 1, "", "__init__"], [23, 2, 1, "", "load_model"], [23, 2, 1, "", "translate"]], "pythainlp.translate.en_th": [[23, 1, 1, "", "EnThTranslator"], [23, 1, 1, "", "ThEnTranslator"]], "pythainlp.translate.en_th.EnThTranslator": [[23, 2, 1, "", "__init__"], [23, 2, 1, "", "translate"]], "pythainlp.translate.en_th.ThEnTranslator": [[23, 2, 1, "", "__init__"], [23, 2, 1, "", "translate"]], "pythainlp.translate.th_fr": [[23, 1, 1, "", "ThFrTranslator"]], "pythainlp.translate.th_fr.ThFrTranslator": [[23, 2, 1, "", "__init__"], [23, 2, 1, "", "translate"]], "pythainlp.translate.zh_th": [[23, 1, 1, "", "ThZhTranslator"], [23, 1, 1, "", "ZhThTranslator"]], "pythainlp.translate.zh_th.ThZhTranslator": [[23, 2, 1, "", "__init__"], [23, 2, 1, "", "translate"]], "pythainlp.translate.zh_th.ZhThTranslator": [[23, 2, 1, "", "__init__"], [23, 2, 1, "", "translate"]], "pythainlp.transliterate.royin": [[24, 0, 1, "", "romanize"]], "pythainlp.transliterate.thai2rom": [[24, 0, 1, "", "romanize"]], "pythainlp.transliterate.wunsen": [[24, 1, 1, "", "WunsenTransliterate"]], "pythainlp.transliterate.wunsen.WunsenTransliterate": [[24, 2, 1, "", "__init__"], [24, 2, 1, "", "transliterate"]], "pythainlp.ulmfit": [[25, 1, 1, "", "ThaiTokenizer"]], "pythainlp.ulmfit.ThaiTokenizer": [[25, 2, 1, "", "__init__"], [25, 2, 1, "", "add_special_cases"], [25, 2, 1, "", "tokenizer"]], "pythainlp.util": [[26, 1, 1, "", "Trie"], [26, 0, 1, "", "thai_consonant_to_spelling"], [26, 0, 1, "", "tone_to_spelling"]], "pythainlp.util.Trie": [[26, 1, 1, "", "Node"], [26, 2, 1, "", "__init__"], [26, 2, 1, "", "add"], [26, 2, 1, "", "prefixes"], [26, 2, 1, "", "remove"]], "pythainlp.util.Trie.Node": [[26, 2, 1, "", "__init__"], [26, 3, 1, "", "children"], [26, 3, 1, "", "end"]], "pythainlp.wangchanberta": [[27, 1, 1, "", "NamedEntityRecognition"], [27, 1, 1, "", "ThaiNameTagger"]], 
"pythainlp.wangchanberta.NamedEntityRecognition": [[27, 2, 1, "", "__init__"], [27, 2, 1, "", "get_ner"]], "pythainlp.wangchanberta.ThaiNameTagger": [[27, 2, 1, "", "__init__"], [27, 2, 1, "", "get_ner"]], "pythainlp.word_vector": [[28, 1, 1, "", "WordVector"]], "pythainlp.word_vector.WordVector": [[28, 2, 1, "", "__init__"], [28, 2, 1, "", "doesnt_match"], [28, 2, 1, "", "get_model"], [28, 2, 1, "", "load_wordvector"], [28, 2, 1, "", "most_similar_cosmul"], [28, 2, 1, "", "sentence_vectorizer"], [28, 2, 1, "", "similarity"]], "pythainlp.wsd": [[29, 0, 1, "", "get_sense"]]}, "objnames": {"0": ["py", "function", "Python function"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "attribute", "Python attribute"], "4": ["py", "data", "Python data"], "5": ["py", "module", "Python module"], "6": ["std", "envvar", "environment variable"]}, "objtypes": {"0": "py:function", "1": "py:class", "2": "py:method", "3": "py:attribute", "4": "py:data", "5": "py:module", "6": "std:envvar"}, "terms": {"": [0, 1, 4, 8, 9, 10, 11, 12, 15, 18, 20, 21, 22, 24, 25, 26, 27, 32], "0": [0, 1, 2, 4, 5, 8, 9, 10, 11, 15, 17, 18, 19, 20, 21, 22, 24, 25, 26, 28, 29, 30, 32, 34, 35], "00": [9, 26], "000": [18, 20], "0000": 32, "0006959172792052158": 18, "00074836": [2, 28], "001": 11, "0011": 25, "00421414": [2, 28], "01": [9, 26], "01047703": [2, 28], "01234567890": 26, "02": [9, 25, 27], "02893357": [2, 28], "02914307": [2, 28], "03": [9, 26], "03059357": [2, 28], "03639471": [2, 28], "04300258": [2, 28], "04562086": [2, 28], "05": [18, 22, 26, 32], "05015393": [2, 28], "05035743": [2, 28], "05048079": [2, 28], "05054322": [2, 28], "05055": 20, "05081136": [2, 28], "05632929": [2, 28], "05899798": [2, 28], "05gb": 9, "06": [9, 26], "06531371": [2, 28], "06607185": [2, 28], "07": 9, "07142857142857142": 9, "08": 9, "08333333333333333": 9, "08881307": [2, 28], "09": 26, "09319090843200684": 29, "09635": 27, "0974416732788086": 29, "0_15_window": [2, 28], "0_5_window": [2, 28], "0d": 26, "0e46": 26, "1": [0, 1, 2, 4, 5, 7, 9, 11, 13, 15, 17, 18, 19, 20, 21, 22, 24, 25, 26, 28, 29, 32, 34, 35], "10": [2, 8, 9, 19, 20, 26, 28, 34], "100": [22, 23, 26], "1000": 32, "100000": [0, 4], "10021889": 26, "1005704402923584": 29, "103": 20, "1031": 21, "10400": 20, "10th": 26, "11": [9, 10, 17, 20, 21, 26], "11130": 20, "11327957": [2, 28], "113882": [2, 28], "114k": 9, "1153": 32, "11641257": [2, 28], "1180": 14, "11940": [1, 24], "12": [25, 26], "1200": 20, "122": 26, "123": 26, "12473666667938232": 29, "127": 21, "12l": 21, "13": [26, 34], "13333333333333333": 9, "14": [9, 20], "1478": 32, "14ibg": 21, "15": [2, 20, 25, 26, 28], "1526795099383855": 9, "1550": 25, "15637": 32, "16": 20, "1618": [2, 28], "166736255494822": 23, "17": [9, 26], "1727": 22, "17654": 32, "18": [9, 17, 26], "1826": 22, "18369348347187042": [2, 28], "18383873999118805": [2, 28], "18807": 32, "189": 20, "19": [8, 21], "19132": 32, "1928885132074356": [2, 28], "1941": 26, "1946": 26, "19610872864723206": [2, 28], "198": 20, "1982": 17, "1983": 17, "1998": 17, "1999": 20, "1l": 21, "1st": 26, "1utqgxxmrxor9jp1b1jcq1frbnvorhtbq": 21, "1v1z657_5eswpo8rlfvrwa0a5e4vkg7si": 21, "2": [0, 4, 9, 11, 12, 13, 15, 17, 18, 19, 20, 21, 25, 26, 30, 34, 35], "20": [20, 23], "200": 26, "2000": [20, 21], "2001": 21, "2004": 20, "2006": [1, 24], "2007": 18, "2008": 20, "2009": 17, "2014": [2, 28], "2016": [21, 35], "2019": 26, "2020": [20, 21], "2021": [21, 27], "2022": [17, 26], "2023": [7, 26, 30], "2025": 35, "20321202278137207": 
[2, 28], "20773497763458507": 19, "21": [20, 21, 26], "2101": 27, "21159221231937408": [2, 28], "21205860376358032": [2, 28], "21660110354423523": [2, 28], "2196873426437378": [2, 28], "22022421658039093": [2, 28], "22162962875299588": 19, "22425422716846247": 19, "23": [10, 26, 27], "2300612": 26, "23191285218852364": 19, "234": 21, "234\u0e1a\u0e32\u0e1719": 21, "23876962942443258": 19, "24": 26, "2405330240726471": [2, 28], "2410": 26, "24123257398605347": [2, 28], "24147476255893707": [2, 28], "243": 17, "24338295": [2, 28], "2438": 26, "2453523576259613": [2, 28], "24824175238609314": [2, 28], "2499": 20, "24996827960821494": 19, "24h": 26, "25": [9, 25, 26, 30], "2510434687137604": [2, 28], "25119125843048096": [2, 28], "2519": 26, "2526": 17, "2535": [1, 24], "2543": [1, 24], "2549": [1, 24], "255": 17, "2551": 9, "2555": 26, "2556": 26, "2561": [1, 24], "2562": 26, "2565": 26, "2566": 26, "26": 9, "2649088203907013": [2, 28], "26500266790390015": [2, 28], "2670": 20, "2678430841321016": 19, "27": [26, 27], "2730445861816406": [2, 28], "28480708599090576": [2, 28], "2878379821777344": [2, 28], "2885020673274994": [2, 28], "29115434699705506": 19, "2995176911354065": [2, 28], "3": [0, 4, 9, 11, 15, 16, 17, 19, 20, 21, 25, 26, 32, 34], "30": [18, 20, 26], "300": [0, 2, 4, 28], "3003999888896942": [2, 28], "30301809310913086": 10, "306713730096817": [2, 28], "30845439434051514": [2, 28], "31": 26, "312": 27, "31320597838000375": 19, "31755179166793823": [2, 28], "32": [9, 34], "3201899230480194": [2, 28], "3223": 18, "3228477063109462": 19, "32304936": [2, 28], "3278159201145172": [2, 28], "32\u0e19": 21, "3333333333333333": 9, "35294117647058826": 9, "36": [30, 32], "3639": 18, "37": 21, "3734": 26, "375": 17, "3rd": 30, "4": [1, 2, 9, 11, 12, 13, 15, 17, 20, 23, 24, 25, 26, 28, 35], "40": [18, 21, 26], "400": [25, 26], "40506999": [2, 28], "40800299": [2, 28], "41": 9, "42831": [2, 28], "42caj4e6bk1f5b1j": 26, "43387136": [2, 28], "434330506948445": 9, "434k": 9, "4453": 25, "4641016151377544": 9, "482306849763902e": 18, "49": 20, "4th": 26, "5": [0, 2, 4, 9, 16, 17, 18, 19, 20, 21, 22, 25, 26, 28, 32, 34], "50": [0, 4, 8, 11, 16, 26], "50755": 32, "50952601": [2, 28], "512": 11, "52": [8, 17], "52269109": [2, 28], "5277": 9, "538973871058276": 9, "543": 26, "56": 26, "58591403": [2, 28], "59": 26, "59434797": [2, 28], "5952": 25, "5b": 11, "5mb": 9, "5x": 21, "6": [0, 4, 9, 10, 17, 22, 26, 34], "6041965": 16, "61": 20, "62": [26, 34], "620": 26, "62977601": [2, 28], "63869202": [2, 28], "639": 9, "6399497389793396": 10, "64": [27, 34], "65": 27, "66": 26, "690kb": 9, "6960948705673218": [2, 28], "6997432112693787": [2, 28], "6h": 26, "7": [0, 4, 10, 11, 20, 26, 27], "7015200257301331": [2, 28], "702155": [2, 28], "7030453681945801": [2, 28], "705004": [2, 28], "70673105": [2, 28], "70760502": [2, 28], "711359": [2, 28], "7120099067687988": [2, 28], "7142490744590759": 10, "735": [2, 28], "7354257106781006": [2, 28], "7471904754638672": [2, 28], "7490593194961548": [2, 28], "775945782661438": [2, 28], "78861002": [2, 28], "7th": 26, "8": [12, 17, 21, 25, 26, 27], "80508": 9, "8173": 32, "82": 21, "8206598162651062": [2, 28], "8314": 32, "8657019734382629": 10, "87": 21, "875": 17, "8888": 20, "89": 27, "8bit": [6, 11], "9": [11, 18, 19, 20, 25, 26, 27], "914392": [2, 28], "9227": 32, "92500597": [2, 28], "9387": 32, "95": [11, 20, 21, 26], "980": 26, "993": 32, "9985288301111273": 9, "A": [1, 7, 8, 9, 13, 14, 17, 18, 19, 21, 23, 24, 25, 26, 33, 34], "AND": 26, "AS": 35, "And": 18, 
"As": 33, "BE": 26, "By": [9, 18, 19, 22, 26, 29, 34], "For": [0, 4, 9, 19, 20, 21, 26, 27, 33, 34, 35], "If": [1, 2, 9, 15, 18, 19, 20, 21, 22, 24, 26, 27, 28, 30, 34], "In": [17, 21, 26, 30, 33], "It": [1, 2, 5, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 24, 26, 27, 28, 33, 34], "No": [1, 9, 15, 20, 24, 33], "OF": 35, "OR": 35, "Of": [20, 32], "One": [1, 24], "The": [0, 1, 2, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 33], "Then": [2, 18, 19, 28], "These": [0, 2, 4, 5, 9, 26, 28], "To": [0, 4, 5, 8, 11, 15, 20], "With": [2, 9, 28, 33, 34], "_": [15, 16, 26], "_73bcj049dzbu9t49b4va170k": 21, "__annotations__": [12, 18], "__dict__": [12, 18], "__doc__": [12, 18], "__init__": [0, 1, 2, 4, 6, 7, 10, 11, 12, 16, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28], "__module__": [12, 18], "__weakref__": [12, 18], "_d": 26, "_depth": 9, "_distanc": 9, "_is_thai_and_not_num": 18, "_path": 9, "_similar": 9, "_token": 21, "_\u0e40\u0e27\u0e2d\u0e23": 9, "aa": 26, "ab": 27, "abbr_en": 9, "abbr_th": 9, "abbrevi": [9, 20, 26, 34], "abbreviation_to_full_text": [26, 34], "abcbdab": 26, "abl": 9, "about": [8, 9, 10, 15, 20, 26, 31, 34, 35], "abov": 26, "above_vowel": 26, "abstract": 9, "ac": [17, 18, 20], "academ": 20, "access": [1, 9, 21, 22, 23, 24], "accident": 26, "accommod": 34, "accomplish": 9, "accord": [9, 12, 17, 18, 21, 26], "account": [20, 26], "accur": [1, 5, 21, 23, 24, 26, 34], "accuraci": [5, 18, 21, 25, 29], "achiev": 29, "acl": [15, 21], "aclweb": [2, 21, 28], "acoust": 20, "act": 23, "activ": [9, 20], "actual": 26, "ad": [21, 25, 26], "adapt": 21, "add": [2, 21, 25, 26, 28], "add_special_cas": 25, "add_year": 26, "addit": [1, 5, 21, 24], "addition": [1, 24], "address": 21, "adher": [12, 26], "adj": [9, 20], "adject": [9, 20], "adp": 20, "adposit": 20, "adv": 20, "advanc": [17, 18, 21, 23, 34], "adverb": [9, 20], "adverbi": 20, "advi": 20, "advis": 35, "advn": 20, "advp": 20, "ae": 26, "aek": 12, "affirm": 20, "after": [9, 20, 21, 25, 26], "against": 32, "agg": 25, "aggreg": [2, 25, 28], "agre": 35, "ah": 26, "ai": [6, 21, 23, 25, 27], "aibuildersx": 23, "aid": [14, 26], "airesearch": [19, 23], "aj": 20, "aksonhan": 3, "aksonhan_to_curr": 3, "al": [7, 9, 21, 27], "algorithm": [14, 17, 18, 19, 20, 21], "alik": 17, "all": [2, 9, 12, 16, 18, 19, 21, 23, 25, 26, 28, 34, 35], "allow": [0, 1, 4, 5, 9, 11, 17, 20, 21, 24, 25, 26, 27], "almost": 26, "along": 21, "alpha": 25, "alphabet": [1, 17, 24, 26, 34], "alreadi": 9, "also": [1, 9, 10, 19, 21, 24, 25, 34], "altern": [0, 1, 4, 24], "alwai": 26, "am": 26, "ambigu": [21, 29], "amd64": 34, "among": 21, "amount": [16, 26], "amp": 25, "an": [1, 3, 5, 9, 10, 14, 16, 18, 19, 21, 24, 25, 26, 29, 33, 35], "analysi": [5, 12, 14, 21, 26, 27], "analyz": [8, 12, 15, 17], "anbsp": 25, "ancestor": 9, "ancient": 30, "ani": [2, 9, 18, 20, 21, 28, 35], "anno": 26, "annot": 20, "annyeonghaseyo": [1, 24], "answer": [6, 10, 11, 15], "anthologi": [2, 21, 28], "anyon": 5, "apach": [30, 35], "aphasia": 21, "api": 9, "appear": [19, 22], "appl": 26, "appli": [1, 8, 15, 21, 24, 25, 26], "applic": [0, 2, 4, 5, 10, 11, 17, 28, 29, 35], "approach": [0, 1, 4, 17, 21, 24, 34], "appropri": 11, "approxim": [17, 26], "ar": [0, 1, 2, 4, 5, 9, 11, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 34, 35], "arab": 26, "arabic_digit_to_thai_digit": 26, "arb": 9, "architectur": 34, "archiv": 26, "arguabl": 33, "argument": [9, 25], "around": [18, 25], "arrai": [2, 25, 28], "art": [18, 21], "arthit": 30, "articl": 20, "artifact": [16, 25], 
"artifici": [9, 23], "arxiv": [20, 27], "ascend": 26, "asia": 26, "asian": 33, "ask": [6, 31], "assert": 14, "assess": 5, "asset": [0, 4], "assign": [18, 21], "assist": [1, 5, 9, 23, 24, 26, 29], "associ": [10, 16, 20, 27], "astrologi": 17, "att": [19, 27], "attacut": [21, 32, 34], "attacuttoken": 21, "attribut": [12, 18, 20, 21, 35], "audienc": [1, 23, 24], "audio": 17, "aug": [0, 4, 16], "augment": [16, 30, 34], "author": 21, "automat": 34, "aux": 20, "auxiliari": 20, "av": 20, "avail": [0, 4, 9, 17, 20, 21, 23, 25, 27, 32], "averag": [19, 20, 25], "avocado": 14, "avoid": 21, "awar": 29, "awd_lstm": 25, "ax": 20, "a\u02d0": [1, 17, 24], "b": [20, 25, 26], "backward": 21, "bact": 9, "badli": 9, "bag": [16, 25], "baht": 26, "bahttext": 26, "balanc": 21, "bangkok": 26, "bangkokhealth": 20, "bank": 26, "base": [0, 1, 4, 9, 10, 11, 14, 15, 16, 17, 18, 19, 21, 24, 25, 26, 27, 29], "base_word": 9, "baselin": 21, "basetoken": 25, "basi": [33, 35], "basic": [12, 15, 17, 18, 27], "bdab": 26, "bdcab": 26, "becaus": [26, 33], "becom": 25, "been": 9, "befor": [2, 5, 12, 20, 21, 25, 26, 28, 34], "begin": [11, 20, 26], "behavior": 26, "beij": 21, "being": 26, "bela": 10, "belong": [20, 26], "below": [9, 26], "below_vowel": 26, "benchmark": [14, 27, 30, 32, 34], "benefici": [18, 26], "bert": [15, 19, 21], "best": 33, "beta": 25, "better": 34, "between": [2, 9, 12, 15, 17, 20, 21, 23, 25, 26, 28], "bfloat16": [6, 11], "bheinzerl": [0, 4], "biggest": 33, "bigram": [9, 19], "bill": 8, "binari": [0, 4], "bind": 21, "bit": 34, "blackboard": 20, "blackboard_ud": 20, "bleu": 23, "bmsacr": 9, "boil": 33, "bool": [0, 1, 2, 4, 5, 6, 9, 11, 12, 14, 16, 18, 19, 20, 21, 23, 24, 26, 27, 28], "boolean": [12, 18], "boonkwan": 20, "boriboon": 20, "both": [2, 12, 17, 21, 26, 28, 33], "bound": 25, "boundari": 21, "bpemb": [0, 4], "bpemb_wv": [0, 4], "bracket": [16, 25], "break": [21, 25, 27], "breakiter": [9, 21], "bridg": 23, "broken": 21, "br\u00fcckner": 17, "bsd": 26, "buddhist": 26, "build": [20, 29, 34], "builder": 23, "built": [16, 27, 34], "bul": 9, "bunch": 25, "bundl": 9, "c": [9, 21, 26, 27, 34], "c16": 21, "calcul": [2, 9, 13, 26, 28], "calculate_ngram_count": 13, "calendar": 26, "call": [11, 17, 19, 26, 34], "callabl": [9, 16, 18, 25], "can": [2, 5, 8, 9, 10, 12, 15, 17, 18, 19, 21, 22, 25, 26, 28, 29, 30, 31, 32, 33, 34], "candid": 18, "canin": 21, "cannot": [14, 26], "capabl": [0, 4, 11, 15, 17, 23, 27], "capit": 25, "captur": 33, "cardin": 20, "care": [19, 26], "case": [1, 18, 21, 24, 25], "cat": [9, 23, 26], "catalog": [9, 32], "categor": 14, "caus": 26, "cc": [0, 4, 9, 20, 35], "cc0": 35, "cconj": 20, "cell": 21, "central": 23, "centuri": 26, "certainli": 26, "cfqc": 20, "ch": 26, "chakri": 26, "challeng": [8, 33], "chanc": 21, "chang": [21, 26], "chaovavanich": [9, 21, 30], "chapter": 26, "char": [15, 22, 26], "char_level": 32, "charact": [1, 5, 9, 12, 16, 18, 21, 24, 25, 26], "characterist": 26, "charin": 30, "charun": 20, "chase": 9, "chat": 30, "chatbot": [6, 11], "chatbotmodel": 6, "check": [12, 14, 18, 26], "check_aek_too": 12, "check_karu_lahu": 12, "check_klon": 12, "check_marttra": 12, "check_sara": 12, "checker": [12, 18], "children": 26, "chines": [23, 33], "chodorow": 9, "choic": [18, 21], "choos": [1, 18, 19, 20, 21, 24, 26], "chormai": 30, "chosen": 11, "chula": [17, 18], "chulalongkorn": 17, "chumpolsathien": 21, "chumpolsathien_2020": 21, "chunk": 20, "chunk_pars": 20, "ch\u00e0o": [1, 24], "cite": [17, 21, 27, 30], "cl": 20, "clariti": 26, "class": [1, 2, 6, 7, 10, 11, 12, 16, 18, 19, 
20, 21, 23, 24, 25, 26, 27, 28], "classic": 12, "classif": [7, 25, 26, 27, 33, 34], "classifi": [20, 30], "clean": [5, 6, 25, 26], "cleaner": 25, "clip": 25, "clir": 17, "clock": 26, "close": 26, "closer": 17, "closest": 18, "cltv": 20, "cluster": [8, 21], "clusters_str": 8, "cmn": 9, "cmtr": 20, "cnit": 20, "co": [1, 23, 24, 26], "code": [9, 16, 17, 21, 22, 23, 25, 26, 35], "coher": 11, "col": 5, "colab": [21, 27], "collat": 26, "collect": [5, 9, 14, 20, 21, 22, 25, 26], "colloc": 21, "com": [0, 4, 9, 10, 14, 16, 19, 20, 21, 23, 26, 27], "combin": [2, 17, 21, 28], "come": 9, "comma": 33, "command": [30, 34], "comment": 9, "common": [9, 20, 21, 26, 35], "commun": [9, 21, 23, 26], "compact": 25, "compar": [2, 5, 20, 28], "comparison": 17, "compat": [0, 4, 26], "compil": [1, 24, 34], "compl": 9, "complet": 19, "complex": 25, "complianc": 35, "compon": [1, 8, 10, 12, 15, 18, 21, 23, 24, 25, 26, 27, 34], "comprehend": 9, "comprehens": [1, 21, 24, 27], "compress": 17, "compressor": 7, "comput": [2, 5, 9, 19, 25, 26, 28], "compute_stat": 5, "concepnet": 9, "concept": [8, 21], "conceptu": 26, "condit": [21, 35], "config": [25, 34], "configur": 18, "conjunct": 20, "conll": 15, "conllu": 15, "connect": 9, "connector": 33, "consecut": [11, 19], "consid": [9, 12, 20, 25], "consider": 9, "consin": [2, 28], "consist": [5, 9, 11, 25, 26], "conson": [3, 26], "constrain": [9, 21], "constraint": 9, "construct": 10, "contain": [0, 2, 4, 9, 20, 21, 22, 26, 28, 29], "content": [11, 23, 26], "context": [8, 11, 20, 26, 29, 33], "contigu": 21, "continu": 21, "contribut": 23, "contributor": [9, 22], "conveni": [2, 11, 28], "convent": 26, "convers": [1, 23, 24, 26], "convert": [1, 2, 3, 17, 20, 23, 24, 25, 26, 28], "convert_year": 26, "coordin": [20, 23], "cop": 15, "copi": [1, 9, 24, 35], "copyright": 35, "core": [10, 12, 15, 16, 25, 29], "coref": 30, "corefer": 34, "coreference_resolut": [8, 34], "corpora": [2, 9, 20, 25, 28, 35], "corpu": [0, 4, 11, 18, 19, 20, 21, 23, 26, 27, 30, 32, 34, 35], "correct": [5, 12, 21, 22, 26, 29], "correctli": [5, 18, 26], "correctly_tokenised_word": 32, "corrector": [18, 26], "correspond": [5, 10, 12, 20, 26], "cos_sim": 29, "cosin": [2, 28], "could": [9, 21, 26, 34], "count": [13, 26], "count_thai_char": 26, "counter": [25, 26], "countries_th": 9, "countthai": 26, "cover": 23, "covid": 8, "cp": 17, "cp36": 34, "cp36m": 34, "cpe": 19, "cplusplu": 26, "cpu": [6, 8, 11, 27, 29, 34], "crawl": [0, 4], "creat": [11, 14, 15, 19, 20, 21, 22, 26, 33, 34, 35], "creation": [11, 12], "creativ": 35, "credit": [9, 21], "crf": [20, 21], "crfcut": [21, 32], "crfsuit": 34, "criteria": 26, "critic": [10, 25], "cross": [17, 21, 23], "crucial": [5, 8, 22, 25, 26, 29], "cstorm125": [0, 4], "ctime": 26, "cuda": [6, 8, 10, 11], "cue": 26, "cultur": [23, 26], "curl": 9, "currenc": 26, "current": [3, 5, 18, 21, 26, 34], "custom": [1, 9, 18, 21, 24, 26], "custom_dict": [18, 21, 29], "custom_dict_japanese_nam": 21, "custom_dictionari": 21, "custom_token": 29, "custom_words_list": 21, "customiz": 18, "cut": [21, 25], "cvbl": 20, "d": [9, 26], "dai": 26, "dan": 9, "dangl": 26, "data": [0, 2, 4, 5, 9, 11, 18, 22, 25, 26, 27, 28, 32, 34], "databas": [0, 4, 9, 34], "datafram": 5, "dataset": [9, 21, 22, 27, 32, 35], "dataset_nam": 27, "datatim": 26, "date": [20, 21, 26], "datetim": 26, "datetime_obj": 26, "db": 9, "dcnm": 20, "ddac": 20, "ddan": 20, "ddaq": 20, "ddbq": 20, "de": 33, "dead": [12, 26], "dead_syllable_as_aek": 12, "deal": [8, 29], "deberta": 15, "decid": 33, "decim": [21, 26], "decod": 25, 
"decreas": 18, "dedic": [1, 8, 12, 23, 24, 35], "deep": [1, 24, 25], "deepcut": [21, 32], "default": [1, 2, 8, 9, 11, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 34], "default_db": 9, "default_word_dict_tri": 21, "defaultdict": 9, "defin": [19, 21], "definit": [20, 29], "degre": 33, "delimit": [21, 26], "demonstr": 20, "deni": 34, "depa": [23, 27], "depend": [15, 20, 34], "dependency_pars": 34, "deprec": 26, "depth": 9, "deriv": 15, "descend": [18, 26], "design": [0, 1, 4, 5, 8, 9, 11, 12, 17, 18, 21, 22, 23, 24, 25, 26, 27, 29], "det": 20, "detail": [0, 2, 4, 9, 14, 20, 27, 28, 32, 34], "detect": [14, 15, 21, 26], "detector": 26, "determin": [20, 26, 29], "detoken": 21, "dev": 9, "develop": [0, 4, 5, 17, 21, 22, 26, 34], "devic": [6, 8, 9, 10, 11, 29], "dhanon": 20, "diac": 20, "diacrit": [1, 24, 26], "diaq": 20, "dibq": 20, "dict": [1, 5, 8, 9, 10, 11, 13, 18, 20, 21, 24, 25, 26, 29], "dict_filt": 18, "dict_item": 18, "dict_nam": 21, "dict_sourc": [21, 26], "dict_tri": [9, 21, 26], "dictionari": [1, 9, 13, 18, 19, 21, 24, 25, 26, 29], "die": 26, "differ": [5, 8, 19, 20, 21, 23, 33], "difficult": 9, "digit": 26, "digit_to_text": 26, "dim": [0, 4], "dimens": [2, 25, 28], "direct": [23, 26], "directli": [2, 22, 28, 32, 34], "directori": [9, 22, 34], "disambigu": [9, 29], "discard": 9, "discours": 20, "discuss": [2, 28], "displai": [21, 26], "display_cell_token": 21, "display_thai_char": 26, "dist": [9, 22], "distanc": [9, 18, 29], "distil": 21, "distribut": [26, 34, 35], "divers": [0, 4, 16], "diversif": [0, 4], "diversifi": [0, 4], "divid": 21, "dl": 9, "do": [1, 9, 11, 16, 24, 25, 26, 33], "doc": [0, 4, 19, 21, 25, 26], "document": [0, 2, 4, 5, 9, 19, 21, 22, 25, 26, 28, 35], "document_vector": 25, "document_vectorr": 25, "doe": [16, 20, 22, 25, 26, 33, 34], "doesn": [9, 20], "doesnt_match": [2, 28], "dog": 9, "domain": [21, 26, 35], "domini": 26, "don": 25, "done": 25, "donm": 20, "doubl": 26, "down": [3, 25, 26, 27, 33], "download": [0, 4, 22, 23, 32, 34], "download_model_al": 23, "downstream": 5, "drill": 9, "drive": 21, "drop_mult": 25, "dt_obj": 26, "du": 23, "due": [21, 26], "duplex_hous": 9, "duplic": [9, 11, 26], "dwell": 9, "dynasti": 26, "e": [9, 19, 20, 21, 26], "each": [0, 2, 4, 9, 18, 19, 20, 21, 23, 26, 28, 34], "eaff": 20, "eas": 26, "easi": [9, 19, 33], "easier": [25, 26], "ed": 26, "edg": 25, "edit": [9, 18], "edu": [9, 34], "effect": [1, 5, 8, 9, 19, 23, 24, 25], "effici": [21, 22, 25, 26], "effort": [9, 34], "either": [2, 28, 35], "eitt": 20, "el": [30, 34], "el_scor": 10, "electr": [2, 28], "electron": 26, "elimin": [25, 26], "ell": 9, "em_sz": 25, "email": [20, 21], "emb": [19, 25], "emb_sz": 25, "embed": [0, 4, 19, 21, 25], "embed_p": 25, "emoji": [25, 26], "emoji_to_thai": 26, "emojipedia": 25, "empir": 30, "empti": [9, 16, 25, 26, 29], "en": [0, 4, 9, 11, 23, 26], "en_th": 23, "enabl": [0, 4, 11, 17, 22, 23], "encapsul": [2, 21, 28], "encod": [17, 21, 25, 26], "encoder_dp": 25, "encompass": 22, "end": [9, 20, 21, 26, 33], "endur": 9, "eng": [9, 17], "eng_to_thai": 26, "engin": [15, 17, 18, 23, 25, 33, 34], "english": [0, 1, 4, 9, 12, 14, 17, 23, 24, 25, 26], "enhanc": [0, 2, 4, 18, 21, 23, 26, 28, 29], "enrich": [0, 4], "ensembl": 21, "ensur": [1, 2, 12, 21, 22, 23, 24, 25, 26, 28, 29], "enth": 23, "enthtransl": 23, "entir": [14, 18], "entiti": [8, 9, 10, 16, 20, 21, 27, 34], "entity_typ": 20, "entropi": 21, "environ": [2, 22, 28, 34], "eo": 33, "epitran": [1, 24], "equal": [1, 20, 24], "equat": 9, "equip": 12, "equival": 26, "era": 26, "errno": [9, 34], 
"error": 18, "esolut": 34, "especi": 26, "essenti": [2, 5, 9, 10, 21, 22, 26, 27, 28], "esupar": [15, 34], "et": [7, 21, 27], "etc": [33, 34], "etcc": 21, "eu": 9, "even": 33, "everi": [18, 26], "everyth": 34, "exact": [9, 20], "exactli": 18, "exampl": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 32, 33, 34], "excel": 26, "except": 35, "exclud": [20, 26], "exclude_stopword": 26, "executor": 34, "exercis": 9, "exist": [9, 18, 21, 22, 25, 34], "expand": [18, 23, 26], "expand_maiyamok": 26, "expans": 16, "expens": 19, "explicit": [2, 28], "explicitli": [21, 22], "explor": [0, 4, 27], "exponenti": 21, "expos": 22, "express": [8, 20, 33, 35], "extend": [11, 18], "extens": [18, 26], "extern": 22, "extra": 34, "extra1": 34, "extra2": 34, "extra_id_0": 19, "extract": [21, 26, 27], "extract_featur": 21, "extract_keyword": 19, "extran": 25, "extrem": 33, "f": [12, 21, 26], "f1": 5, "fa": 9, "face": 33, "facebook": [9, 21, 23], "facilit": [0, 4, 5, 11, 23, 26], "fail": [18, 34], "fall": 26, "fallback_engin": [1, 24], "fals": [1, 2, 5, 6, 9, 11, 12, 14, 16, 19, 20, 21, 23, 24, 25, 26, 27, 28, 34], "famili": 9, "familiar": [1, 24], "family_names_th": 9, "faq": 30, "fast": [21, 25, 26, 34], "fastai": [16, 25], "faster": 21, "fasttext": [0, 4], "father": 9, "featur": [1, 17, 21, 24, 25, 26], "feb": 27, "fed": 21, "femal": 9, "field": [21, 26, 33], "file": [0, 4, 9, 21, 25, 26, 32, 34, 35], "file_path": 21, "fileid": 21, "filenam": 9, "filenotfounderror": 9, "filter": [16, 18, 19, 21, 25], "fin": 9, "final": 19, "financi": 26, "find": [0, 2, 4, 7, 9, 12, 17, 18, 21, 25, 26, 28, 34], "find_all_seg": 21, "find_keyword": 26, "find_synonym": [0, 4], "fine": [21, 25, 27], "first": [2, 9, 18, 19, 26, 28], "fit": 26, "five": 26, "fix": [26, 34], "fix_html": 25, "fixn": 20, "fixv": 20, "flexibl": [1, 24], "float": [0, 2, 4, 5, 9, 11, 17, 18, 19, 21, 22, 26, 28, 29], "float16": [6, 11], "fmt": 26, "fn": 32, "focu": 21, "focus": [1, 23, 24, 25, 26], "folder": [6, 11], "follow": [0, 4, 5, 8, 11, 15, 20, 21, 26, 27, 30, 33], "follow_vowel": 26, "food": [2, 28], "forc": 9, "form": [1, 2, 9, 12, 20, 24, 26, 28], "format": [1, 9, 14, 16, 20, 21, 24, 26, 27], "forward": 21, "found": [9, 18, 26], "foundat": [2, 27, 28], "four": 26, "fp": 32, "fr": 23, "fra": 9, "framework": [5, 21], "free": [7, 26], "freebsd": 26, "french": [23, 33], "freq": 18, "frequenc": [9, 11, 18, 19, 20, 26], "frequent": 31, "friendli": 26, "from": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34], "front": [25, 26], "frozen": 25, "frozenset": 9, "full": [22, 26, 34], "func": [18, 19, 25], "function": [1, 2, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 34], "fundament": [1, 5, 15, 18, 21, 24, 26, 27], "further": [0, 4, 9, 21, 27], "futur": 5, "fx": 20, "g": [9, 19, 21], "g2p": [1, 24, 32], "gap": 23, "gate": 8, "gen": 11, "gen_instruct": 11, "gener": [0, 1, 2, 4, 6, 9, 16, 18, 21, 24, 26, 28, 30, 34], "gensim": [2, 28], "get": [2, 9, 10, 21, 25, 28, 29, 30, 32, 34], "get_el": 10, "get_full_data_path": 22, "get_model": [2, 28], "get_ner": [16, 20, 27], "get_pythainlp_data_path": 22, "get_pythainlp_path": 22, "get_sens": 29, "get_tag": 16, "gist": 21, "github": [0, 1, 4, 10, 15, 16, 19, 21, 23, 24, 26, 27, 30, 31], "give": 26, "given": [2, 9, 15, 16, 17, 18, 19, 20, 22, 26, 28, 29], "glg": 9, "global": [1, 24, 26], "gnu": 26, "go": 16, "god": 9, "godhead": 9, "goeswith": 15, "gohlk": 34, "goldberg": [2, 
28], "good": 34, "googl": [21, 27], "got": 26, "govern": [22, 35], "gpu": [23, 27], "gram": [11, 13, 21], "grammar": 21, "grammarli": 18, "grammat": 15, "graph": [9, 10, 21], "graphem": [1, 24], "great": [9, 21], "greater": 19, "greet": 9, "gregorian": 26, "ground": 5, "group": [9, 23, 27], "grouped_ent": 27, "guid": 26, "guidelin": [0, 4, 5, 20], "gzipmodel": 7, "h": [1, 24, 26], "ha": [9, 14, 20, 26, 33], "habit": 33, "han": [8, 21], "han_solo": 21, "handbook": 26, "handl": [12, 21, 22, 25, 26], "handle_karun_sound_sil": 12, "hanyu": [1, 24], "hao3": [1, 24], "hard": [9, 33], "have": [2, 9, 25, 26, 28, 33, 34], "head": 15, "heb": 9, "hei": 16, "hejira": 26, "hello": 9, "help": [8, 9, 15, 21, 25, 26, 32], "helsinki": 23, "hepburn": [1, 24], "here": [8, 10, 11, 12, 15, 22], "heurist": [14, 21], "hidden": [19, 26], "hidden_p": 25, "high": 26, "highest": [18, 19], "highli": 19, "highlight": [1, 24], "histor": [26, 33], "histori": 6, "historian": 26, "hitoshi": 20, "home": [9, 34], "homonym": 17, "hood": 19, "hope": 9, "hour": 26, "hous": 9, "how": [1, 8, 10, 11, 12, 15, 18, 24, 25, 34], "hrv": 9, "hss": 9, "html": [0, 4, 16, 20, 25, 26, 27], "http": [0, 1, 2, 4, 9, 10, 14, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 34, 35], "huggingfac": [1, 15, 23, 24], "human": 26, "hunspel": 18, "hypernym": 9, "hyponym": 9, "h\u0101phyntr": [1, 24], "i": [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34, 35], "icu": [1, 9, 21, 24, 33, 34], "icu_vers": 34, "icubrk_th": 9, "id": 9, "idea": 29, "ident": 9, "identif": [14, 26], "identifi": [8, 14, 15, 18, 21, 22, 26, 27], "idn": 26, "idna": 26, "ignor": [19, 26], "ignore_char": 26, "ij": 20, "illustr": 9, "impact": 5, "imper": 20, "implement": [7, 17, 18, 19, 21, 26], "impli": 35, "import": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 33], "improv": [0, 4, 5, 9, 18, 21, 26], "in_dict": 21, "includ": [0, 1, 4, 5, 9, 10, 11, 15, 16, 19, 20, 21, 24, 25, 26, 27, 33, 35], "incorpor": 5, "incorrect": 26, "incorrectli": 26, "incur": 18, "ind": 9, "indefinit": 20, "independ": 26, "index": 30, "indic": [1, 5, 9, 18, 20, 24, 26, 33], "indiv_char": 21, "individu": [21, 25, 26], "info": [20, 32], "inform": [8, 9, 10, 15, 17, 21, 25, 26, 27, 34, 35], "initi": [11, 12, 18, 21, 25], "inner": 22, "input": [1, 2, 8, 9, 11, 12, 15, 16, 18, 19, 21, 22, 23, 24, 25, 26, 28, 32], "input_d": 26, "input_fil": 32, "input_p": 25, "inrut": 21, "insepar": 21, "insert": 25, "insid": 20, "insight": 22, "instal": [2, 28, 30], "instanc": [8, 12, 18, 19, 20, 26], "instanti": 21, "instantiatetd": 26, "instead": [9, 25, 26], "institut": [1, 21, 23, 24, 27], "instruct": 11, "instruct_gener": 11, "int": [0, 4, 7, 9, 11, 12, 13, 16, 17, 18, 19, 20, 21, 25, 26], "integ": 26, "integr": [0, 4], "intellig": [9, 23], "intend": [22, 26, 27], "intention": 26, "interest": 5, "interfac": [2, 23, 28], "interim": 26, "interject": 20, "intermedi": 20, "intern": [1, 21, 22, 24, 26, 34, 35], "internation": 26, "internet": [9, 27], "interpret": [9, 29], "interrog": 20, "intersect": 18, "intj": 20, "invalu": [1, 5, 24, 26], "invis": 26, "involv": [15, 29], "io": 30, "iob": [16, 20, 27], "ip": 21, "ipa": [1, 17, 24, 26, 34], "ipa_to_rtg": 26, "is_exclud": 11, "is_khave": 12, "is_native_thai": 14, "is_poetri": 12, "is_sumpu": 12, "isa": 9, "isahara": 20, "iscit": 21, "ismiddlecut": 21, "iso": [1, 9, 24], "iso_11940": [1, 24], "issu": [1, 24, 33], "isthai": 26, "isthaichar": 26, "ita": 9, 
"item": 26, "itemsview": 18, "iter": [9, 18, 19, 20, 21, 26], "itos_new": 25, "itos_pr": 25, "its": [2, 12, 22, 25, 26, 27, 28, 29], "itself": 18, "ix": 26, "j": [1, 24, 26], "ja": 9, "jakkrit": 21, "jan": 27, "jantrakulchai": 27, "japan": [9, 20], "japanes": [1, 24, 33], "javascript": 26, "jcmp": 20, "jcrg": 20, "jeeragon": 21, "jiang": 7, "jitkrittum": 21, "join": [21, 22], "join_broken_num": 21, "joiner": [25, 26], "journal": [1, 20, 24], "jp": [1, 24], "jp_input": [1, 24], "jpn": 9, "jsbr": 20, "json": 9, "jupyt": 25, "just": [18, 26], "k": [7, 11, 19, 26], "k_type": 12, "kai": 26, "kanchanawan": [1, 24], "kanyanat": 20, "karun": 12, "kb": 19, "kedmane": 26, "keep": [9, 18, 21, 22], "keep_whitespac": 21, "kei": [9, 10, 13, 18, 26], "keyboard": 26, "keyedvector": [2, 28], "keyerror": [2, 28], "keyphras": 19, "keyphrase_ngram_rang": 19, "keyword": [21, 26], "khamyo": 26, "khave": 30, "kind": [9, 21, 35], "king": [17, 26], "kl": 26, "kluai": 26, "kluaj": 26, "kmutt": 19, "knowledg": [9, 10, 21], "known": [1, 10, 18, 24, 33], "ko": [1, 24, 26], "koichiyasuoka": 15, "korakot": [9, 21, 30], "korean": [1, 24], "kosawat": 20, "kriengket": 20, "krit": 20, "kucut": 9, "kv": 12, "l": [26, 27], "label": [2, 7, 9, 16, 20, 28, 32], "lack": 33, "lalita": [23, 30], "lalitadeelert": 23, "lambda": 9, "lang": [0, 1, 4, 24, 25, 26], "languag": [0, 1, 2, 4, 5, 8, 9, 10, 11, 12, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 35], "language_model_learn": 25, "larg": [0, 2, 4, 19, 21, 25, 28], "last": 19, "lastli": 23, "latin": [1, 24], "latticestr": 21, "law": [20, 35], "layer": 19, "layout": 26, "la\u02d0": 26, "lc": 26, "lc_time": 26, "lch": 9, "leacock": 9, "lead": [0, 4], "lead_vowel": 26, "learn": [1, 14, 21, 24, 25], "learner": 25, "learnt": 34, "least": 9, "leenoi": 20, "len": [5, 9, 20], "length": [10, 17, 18, 26], "leopard": [2, 28], "less": [19, 33], "letter": [1, 24, 33], "level": [5, 15, 16, 25, 26], "leverag": 19, "levi": [2, 28], "lexic": [0, 4, 9], "lfd": 34, "lib": [9, 22], "libc": 26, "librari": [2, 10, 21, 22, 23, 26, 28, 30, 33, 34], "licens": [9, 30], "light": 22, "like": [9, 16, 17, 18, 20, 22, 27, 33, 34], "limit": [21, 35], "limkonchotiwat": 30, "limmaneepraserth": 21, "line": [9, 26, 30, 34], "ling": 18, "lingual": 21, "linguist": [2, 20, 26, 28], "link": [8, 10, 21, 26], "linked_ent": 10, "linux": 26, "list": [0, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 25, 26, 27, 28, 29, 33], "list_neg": [2, 28], "list_posit": [2, 28], "list_text": 10, "list_to_str": 21, "list_word": [13, 17, 18], "live": 26, "lk82": 32, "ll": 25, "lm": [0, 4, 16, 30], "load": [0, 2, 4, 6, 7, 11, 21, 23, 25, 28], "load_data": 25, "load_dict": 21, "load_engin": 20, "load_in_8bit": [6, 11], "load_model": [6, 11, 23], "load_w2v": [0, 4], "load_wordvector": [2, 28], "local": [9, 21, 22, 26, 32], "locat": [16, 20, 21, 22, 26, 27, 34], "log": 9, "logic": 26, "long": [21, 26], "longest": [9, 21, 26, 32], "longest_common_subsequ": 26, "longestmatchtoken": 21, "look": [1, 18, 24, 34], "lookup": [1, 24, 26], "lorchirachoonkul": 17, "lot": 33, "love": 23, "low": [6, 7, 11, 26], "low_cpu_mem_usag": [6, 11], "lowercas": 25, "lowercase_al": 25, "lowphansirikul": [27, 30], "lowphansirikul_2021": 27, "lst20": 20, "ltw2v": [0, 2, 4, 28], "ltw2v_v1": [2, 28], "luantangsrisuk": 20, "lunar": 26, "lunarlist": 16, "m": [1, 17, 24, 26, 27], "m6h": 26, "maartengr": 19, "maat2": [1, 24], "machin": [23, 25, 34], "maco": 26, "macron": [1, 24], "mai": [0, 4, 19, 20, 21, 22, 26, 34, 35], "main": 21, 
"mainli": 22, "maintain": 30, "maiyamok": 26, "maj": 26, "make": [0, 1, 2, 4, 5, 9, 17, 18, 20, 23, 24, 25, 26, 28, 33], "male": 9, "man": [16, 26], "man3": 26, "manag": [17, 21, 22, 25, 26, 32], "mandarin": [1, 24], "mani": [21, 26], "manner": 26, "manpages_iphoneo": 26, "manual": 26, "map": [0, 2, 4, 9, 20, 28], "mappingproxi": [12, 18], "marianmt": 23, "mark": [16, 20, 21, 26, 33], "master": 17, "mastersthesi": 21, "match": [17, 20, 21, 26], "max_keyword": 19, "max_len": 18, "max_length": 16, "max_n_gram": 21, "max_new_token": 11, "max_syn_s": [0, 4], "maxim": 21, "maximum": [0, 4, 9, 11, 13, 18, 19, 21], "ma\u02d0t3": [1, 24], "md_score": 10, "meal": [2, 28], "mean": [2, 9, 15, 19, 25, 26, 28, 29, 34], "meaning": 23, "meant": 26, "measur": [9, 20, 29], "mechan": [21, 22], "media": 21, "mem": [6, 11], "member": 9, "memori": 21, "mental": 9, "mention": 10, "merg": 25, "merge_wgt": 25, "metacpan": 26, "metaphon": 17, "metasound": 32, "meth": 25, "method": [0, 1, 2, 4, 5, 7, 11, 12, 14, 18, 21, 23, 24, 25, 28, 30], "metric": [5, 18], "microsecond": 26, "microsoft": [26, 34], "mid": 26, "middle_cut": 21, "min_df": 19, "min_freq": 18, "min_len": [18, 26], "mini": 21, "minim": 19, "minimum": [13, 18, 19, 26], "minut": 26, "miscellan": 22, "misspel": [22, 32], "mistak": 26, "mix": [9, 21], "ml": 34, "mm": 21, "mmcut": 21, "mode": [21, 34], "model": [0, 1, 2, 4, 6, 7, 8, 9, 10, 11, 15, 16, 19, 20, 21, 23, 24, 25, 27, 28, 29, 34, 35], "model_nam": [2, 6, 8, 10, 19, 28], "model_path": [0, 4, 7, 11, 25], "modif": [9, 26], "modifi": [25, 26], "modify_s": [0, 4], "modul": [0, 4, 5, 8, 10, 14, 30], "monei": [20, 26], "monetari": 26, "month": 26, "monthika": 20, "mood": 20, "more": [1, 2, 9, 11, 14, 16, 18, 21, 24, 25, 26, 28, 34, 35], "more_word": 9, "morphem": 30, "mors": 26, "morse_decod": 26, "morse_encod": 26, "morse_text": 26, "most": [2, 9, 18, 19, 21, 28, 29, 33], "most_similar": [2, 28], "most_similar_cosmul": [2, 28], "mostli": [2, 28], "mpnet": 29, "mt": 23, "mt5": [19, 34], "multi": [21, 33], "multi_cut": 21, "multiel": 10, "multilingu": [9, 23, 29], "multipl": [0, 1, 2, 4, 16, 21, 24, 25, 26, 28], "multithread": 21, "myz7nzar7dmw": 21, "n": [1, 9, 11, 13, 17, 19, 21, 24, 26, 27], "n_gram": 21, "n_hid": 25, "n_layer": 25, "n_max": 13, "n_min": 13, "n_sent": [0, 4], "nakhun": 21, "nakhunchumpolsathien": 21, "name": [0, 2, 4, 6, 9, 10, 11, 12, 15, 16, 17, 19, 20, 21, 22, 25, 26, 27, 28, 34], "name_en": 9, "name_synset": 9, "name_th": 9, "named_ent": [20, 21], "namedentityrecognit": 27, "namedentitytagg": 16, "naoto": 20, "naphasia": 21, "nation": [9, 11, 18, 20], "nativ": [14, 20, 26, 33], "natur": [2, 5, 8, 9, 10, 15, 21, 23, 26, 27, 28, 29, 30, 33], "navig": 9, "nbi": 9, "nbsp": 25, "ncmn": [15, 20], "ncnm": 20, "ncsec98": 17, "ndarrai": [2, 19, 25, 28], "ne": 20, "necessari": [11, 12], "nectec": 26, "nectec_to_ipa": 26, "need": [1, 2, 9, 16, 21, 24, 25, 28, 29, 34], "neg": [2, 5, 7, 20, 28], "negat": [9, 20], "negations_th": 9, "nel": 10, "ner": [16, 20, 21, 27], "nercut": 21, "nest": 20, "net": 26, "network": 9, "neural": 21, "neutral": 7, "new": [0, 4, 11, 14, 21, 25, 26, 33, 34], "newlin": [16, 21, 25, 26], "newmm": [9, 19, 21, 25, 32, 33], "next": [9, 11, 18], "ngram": 11, "ngzxj15rkwjnwozlot32fqborbx": 21, "ni3": [1, 24], "niggahita": 14, "nighit": 14, "nikhahit": 26, "nitaya": [1, 24], "nitsuwat": 21, "nlbl": 20, "nld": 9, "nlp": [0, 2, 4, 5, 8, 21, 23, 28, 29, 30, 33], "nlpo3": [21, 34], "nltk": 9, "nn": 20, "nner": 20, "nno": 9, "no_repeat_ngram_s": 11, "nob": 9, "node": [9, 26, 
34], "nomin": 20, "non": [1, 14, 18, 24, 26], "non_thai": 26, "none": [0, 1, 2, 4, 7, 9, 11, 12, 15, 16, 18, 19, 20, 21, 23, 24, 26, 27, 28], "nonm": 20, "nonthaburi": 9, "normal": [19, 20, 25, 26], "norvig": 18, "note": [2, 5, 19, 20, 21, 25, 26, 28, 34], "notebook": [21, 25, 27, 30, 35], "noth": 26, "noun": [9, 15, 16, 20], "novel": 17, "now": [6, 20, 26], "now_reign_year": 26, "np": 20, "nprp": 20, "nrpsc": 20, "nsubj": 15, "nttl": 20, "ntu": 9, "nu": 20, "nuanc": 25, "num": 20, "num_arg": 16, "num_aug": 16, "num_replace_token": [0, 4], "num_to_thaiword": 26, "number": [0, 4, 11, 18, 19, 20, 22, 25, 26], "numer": [20, 21, 25, 26], "numpi": [2, 25, 28], "nutanong": 27, "nw": 32, "n\u0e1b\u0e01\u0e15": 21, "n\u0e1c": 21, "n\u0e41\u0e25\u0e30\u0e44\u0e14": 21, "o": [1, 20, 24], "obj": [9, 15], "object": [0, 2, 4, 9, 12, 18, 19, 21, 25, 26, 28, 29], "obtain": [2, 17, 22, 28, 35], "obvious": 34, "occup": [2, 28], "occurr": [18, 25, 26], "offer": [0, 1, 4, 5, 11, 12, 15, 17, 21, 22, 23, 24, 27], "offici": [0, 1, 4, 5, 24], "offload": [6, 11], "offload_fold": [6, 11], "offset": 10, "often": [25, 33], "oh": 16, "oh_no": 16, "ohay": [1, 24], "ohay\u014d": [1, 24], "omc": 9, "omer": [2, 28], "omerlevy_yoavgoldberg_2014": [2, 28], "omit": 21, "omw": 9, "one": [2, 16, 18, 19, 21, 26, 28, 34], "onli": [1, 2, 6, 9, 11, 17, 19, 20, 21, 24, 26, 28, 34], "onnx": [1, 24], "ontologi": 17, "open": [0, 4, 9, 21, 26, 30], "openbsd": 26, "oper": [2, 21, 22, 26, 28], "optim": [21, 34], "option": [1, 2, 8, 9, 15, 17, 19, 20, 21, 23, 24, 26, 28, 34], "opu": 23, "orchid": [0, 4, 20, 21], "orchid_ud": 20, "orchidpp": 20, "order": [9, 18, 20, 26], "ordin": 20, "org": [2, 9, 20, 21, 25, 26, 27, 28, 35], "organ": [2, 5, 20, 21, 27, 28], "orig_word": 9, "origin": [0, 4, 21, 26], "ors61": [1, 24], "oscar": 11, "oskut": [21, 34], "oss": 30, "other": [1, 2, 5, 6, 8, 9, 11, 19, 20, 24, 26, 28, 33, 35], "otherwis": [2, 9, 16, 20, 21, 26, 27, 28], "out": [1, 2, 9, 16, 19, 21, 24, 25, 26, 28, 33], "out_bia": 25, "output": [0, 1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 32], "output_fil": 32, "output_p": 25, "output_str": 11, "output_typ": 26, "outsid": 20, "over": 9, "overcom": 33, "overrid": 21, "overview": [2, 28], "p": [0, 1, 4, 11, 20, 24, 26], "pa": 20, "packag": [0, 4, 9, 22, 33, 34], "pad": [21, 26], "pad_token": 25, "padder": 21, "page": [26, 30], "pair": [23, 26], "pali": 14, "palingoon": 26, "palmer": 9, "paludkong": 21, "panda": 5, "panphon": 17, "paper": [17, 27], "para": 21, "paragraph": [12, 21], "paragraph_threshold": 21, "paragraph_token": 21, "parallel": 20, "parallel_mod": 21, "param": [9, 16, 20, 21, 22, 23, 25, 27], "paramet": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29], "paraphras": 29, "pars": [20, 26, 30, 34], "parser": 15, "parsing_result": 15, "part": [0, 4, 9, 11, 16, 20, 21, 26, 27, 32], "particl": 20, "particularli": [0, 1, 4, 17, 24, 29], "partofspeechtagg": 16, "pass": [8, 26], "path": [0, 4, 7, 9, 11, 21, 22, 26, 32, 34], "path_to_custom_dictionari": 21, "patiroop": 21, "patorn": 21, "pattarawat": 30, "pattern": 21, "pavarissi": 16, "pdf": [17, 21], "pdmn": 20, "peerat": 30, "peopl": 9, "per": 22, "percentag": 26, "perform": [0, 4, 5, 9, 15, 21, 27, 29, 33, 34], "period": 33, "permiss": [34, 35], "permissionerror": 34, "person": [16, 20, 21], "person_names_female_th": 9, "person_names_male_th": 9, "peter": 18, "phaholphinyo": 20, "phapn": [1, 24], "phapphayon": [1, 24], 
"phatthiyaphaibun": [21, 30], "phayathaibert": 30, "phone": [20, 21], "phonem": [1, 24, 26], "phonet": [1, 17, 21, 24, 26, 34], "php": 26, "phrase": [1, 20, 21, 24, 26], "phrombut": 20, "phunspel": [18, 34], "physic": 9, "physical_ent": 9, "pick": [2, 19, 28], "pinyin": [1, 24], "pip": 34, "pivot": 22, "pkl": 25, "place": 26, "plai": [5, 10, 22], "platform": 26, "pleas": [0, 4, 5, 9, 19, 27, 30, 35], "pn": 18, "pntr": 20, "po": [0, 4, 9, 15, 16, 20, 21, 27, 32], "pod": 26, "poem": 12, "poem_text": 12, "poetic": 12, "poetri": 12, "point": [2, 21, 26, 28], "pois": 33, "pol": 9, "polpanuma": [27, 30], "pomm": 33, "ponrawe": 21, "por": 9, "pornpimon": 26, "port": 18, "pos_tag": 20, "pos_tag_s": 20, "pos_thai_phayathai": 16, "posit": [2, 5, 7, 9, 21, 26, 28], "posix": 26, "possibl": [0, 4, 9, 18, 21, 26], "possibli": 18, "post": [9, 16, 20, 21, 23, 25], "post_rul": 25, "postag": [0, 4], "postag_corpu": [0, 4], "postype2wordnet": [0, 4], "potato": 33, "power": [0, 4, 11, 12, 18, 25, 26, 27], "pp": [20, 21], "ppr": [15, 20], "pr": 20, "prachya": 20, "practic": [1, 9, 24], "practition": [0, 2, 4, 28], "prasitjutrakul": 17, "prayut": 17, "pre": [2, 15, 18, 20, 21, 25, 28, 34], "pre_rul": [16, 25], "preced": [11, 20], "precis": [5, 21, 25, 26, 29, 32], "predict": [7, 11], "predominantli": 14, "prefer": 17, "prefix": [20, 26], "prel": 20, "prepar": [5, 25], "preposit": 20, "preprocess": [5, 16, 22, 25, 26], "present": [21, 26], "pretrain": [15, 23, 25, 27], "preval": 26, "prevent": 25, "previou": 11, "primari": [8, 12, 29], "primarili": [9, 22], "principl": 12, "print": [3, 6, 7, 8, 9, 10, 11, 12, 15, 20, 21, 26, 29], "prob": [11, 18], "probabilist": 11, "probabl": [0, 4, 11, 18], "problem": [26, 33, 34], "proceed": 30, "process": [0, 1, 2, 4, 5, 8, 9, 10, 15, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34], "process_thai": 25, "produc": 19, "project": [30, 35], "promot": 23, "pron": [15, 20], "prong": 33, "pronoun": [8, 20], "pronounc": [1, 24], "pronunci": [1, 17, 24, 26], "proofread": 18, "proper": 20, "properli": 26, "properti": 17, "propn": 20, "proport": 26, "propos": [17, 21], "prosper": 9, "provid": [0, 1, 2, 4, 5, 8, 9, 11, 12, 15, 17, 18, 21, 22, 23, 24, 25, 26, 28, 29], "provinc": 20, "pth": 9, "pu": 20, "puan": [1, 24], "public": [1, 2, 9, 24, 28, 30, 35], "publish": [1, 24], "pud": 20, "punc": 20, "punct": 20, "punctuat": [20, 26], "purpos": [21, 22, 25, 26], "put": 26, "py": 34, "pyicu": [1, 21, 24, 34], "pythainlp": [31, 32, 33, 34, 35], "pythainlp_data_dir": [22, 34], "pythainlp_read_mod": 34, "python": [18, 21, 26, 30, 33, 34], "python3": [9, 22], "pythonlib": 34, "pytorch": [1, 24], "p\u02b0": [1, 24], "p\u02b0a\u02d0pjanot": [1, 24], "q": 34, "q312": 10, "q484876": 10, "qcn": 9, "qrnn": 25, "qualit": 5, "qualiti": [0, 4], "quantifi": 26, "quantit": 20, "question": [9, 10, 15, 31], "quick": [26, 33], "qwerti": 26, "r": [9, 19, 21, 26], "r0": 32, "rais": [2, 28], "rama": 26, "randn": 25, "random": 21, "rang": [1, 9, 12, 13, 23, 24], "rank": 26, "ratio": [22, 32], "rattanakosin": 26, "raw": 26, "raw_sampl": 5, "re": [7, 19, 26], "reach": 9, "read": [9, 10, 14, 15, 31, 34], "readabl": [18, 26], "reason": 21, "recal": [5, 32], "recent": 33, "recogn": [20, 26], "recognit": [20, 21, 25, 27, 34], "recommend": [19, 34], "reconstruct": 21, "reduc": [16, 21, 25], "redund": 25, "ref_sampl": 5, "refer": [0, 4, 8, 9, 19, 21], "refin": 21, "regex": 21, "regular": [2, 28], "reign": 26, "reign_year": 26, "reign_year_to_ad": 26, "rejoin": 21, "rel": [9, 20, 26], "relat": [2, 5, 9, 22, 25, 
28], "relatedto": 9, "relationship": [8, 9, 15], "releas": [23, 35], "relev": 19, "reli": [21, 26], "reliabl": [14, 18, 21], "remain": 9, "remov": [5, 13, 16, 17, 18, 25, 26], "remove_dangl": 26, "remove_dup_spac": 26, "remove_repeat_vowel": 26, "remove_repeated_ngram": 13, "remove_spac": [5, 16, 25], "remove_tone_ipa": 26, "remove_tonemark": 26, "remove_zw": 26, "render": [1, 24, 26], "reorder": 26, "reorder_vowel": [25, 26], "repeat": [11, 13, 25, 26], "repetit": [16, 19, 25, 26], "replac": [0, 4, 16, 25, 26], "replace_newlin": 16, "replace_rep_aft": [16, 25], "replace_rep_nonum": 25, "replace_spac": 16, "replace_url": [16, 25], "replace_wrep": [16, 25], "replace_wrep_post": [16, 25], "replace_wrep_post_nonum": 25, "repositori": [21, 27], "repres": [1, 2, 9, 15, 24, 25, 26, 28], "represent": [0, 1, 2, 4, 17, 21, 24, 26, 28], "reptit": 25, "request": 23, "requir": [1, 2, 9, 18, 19, 23, 24, 25, 28, 34, 35], "research": [0, 2, 4, 5, 21, 23, 26, 27, 28, 33], "reset": 6, "reset_chat": 6, "resolv": 29, "resourc": [0, 2, 4, 7, 9, 22, 23, 26, 28], "respect": [2, 20, 28], "respons": [10, 18], "result": [12, 15, 18, 20, 26, 32], "retain": [25, 26], "retriev": [10, 17, 22], "return": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], "return_dict": [6, 11], "return_similar": 19, "return_typ": 21, "revers": [21, 26], "revis": [1, 9, 24], "revised_word": 9, "rhyme": [12, 26], "ri35": [1, 24], "ri49": [1, 24], "rise": 26, "rm": 32, "rm_bracket": [16, 25], "rm_useless_newlin": 25, "rm_useless_spac": [16, 25], "roberta": 15, "robust": [21, 26], "role": [5, 10, 22], "roll": 9, "roll_up": 9, "roman": [1, 24, 34], "root": [9, 14, 15, 22, 33], "row": 5, "royal": [1, 9, 24, 26], "royin": [1, 24], "rpre": 20, "rtg": [1, 24, 26], "rtype": [9, 16, 21, 25], "rule": [1, 12, 21, 24, 25, 26, 33], "run": [8, 10, 21, 29], "runtim": [1, 24], "rust": 21, "sa": [9, 26], "saa4": [1, 24], "safe": 21, "safe_mod": 21, "samant": [1, 24], "samat": [1, 24], "same": [8, 21], "sampl": [5, 16, 21, 32], "sample_text": 16, "samsonj": 26, "sandhi": [1, 24], "sara": 26, "sarayut": 21, "satang": 26, "satellit": 9, "save": [7, 9, 32], "sa\u02d05": [1, 24], "sa\u02d0ma\u02d0rot": [1, 24], "sc": 21, "scenario": 21, "school": 21, "sconj": 20, "score": [2, 9, 19, 26, 28], "script": [1, 21, 24, 26], "scrollto": 21, "search": [9, 17, 18, 26, 30], "second": [2, 9, 26, 28], "section": [1, 12, 24], "secur": 26, "see": [1, 5, 9, 19, 21, 24, 25, 26, 29, 34, 35], "seed": [11, 32], "seed1": 32, "seem": 33, "seen": 9, "sefr": 21, "sefr_cut": 21, "segment": [9, 16, 21, 27, 33], "select": [11, 18, 19, 21, 26], "semant": [9, 29], "sens": [9, 29, 33], "sense_label": 9, "sent": [0, 4, 9, 20, 21, 22, 26, 32], "sent_token": [19, 21], "sentenc": [0, 2, 4, 11, 15, 16, 18, 19, 20, 22, 23, 26, 28, 29, 33], "sentence_1": 21, "sentence_2": 21, "sentence_vector": [2, 28], "sentencepiec": [16, 21, 27], "sententi": 20, "sep": 17, "separ": [21, 32], "sequenc": [5, 11, 25, 26], "sequence_s": 21, "serv": [1, 23, 24, 26], "server": 9, "set": [1, 5, 9, 18, 20, 21, 24, 25, 26, 33, 34], "set_tokenize_engin": 21, "set_tokenizer_engin": 21, "setlocal": 26, "setup": 34, "sever": 11, "sft": 11, "sg": 9, "shard": 11, "share": [9, 23], "shed": 22, "short": 26, "shortest": 9, "should": [1, 12, 24, 26, 33], "show": [9, 20, 25], "show_pronunci": [1, 24], "shown": 9, "side": 34, "sign": 26, "signific": [1, 24, 26], "significantli": 5, "silenc": 12, "silent": 12, "similar": [2, 9, 17, 18, 19, 25, 26, 28, 29, 33], "simpl": 
[1, 10, 11, 14, 16, 24, 26], "simpli": 26, "simplifi": 23, "simul": 22, "sinc": 26, "singapor": 30, "singl": [2, 11, 18, 21, 25, 28], "sitthaa": 20, "size": [11, 13, 21, 25], "skill": 34, "skip": 11, "skip_special_token": 11, "slide": 21, "slv": 9, "small": 19, "small100": 23, "smaller": [17, 21], "smallest": 21, "smooth": 22, "snae": 17, "so": [19, 25, 34], "social": 21, "societi": [9, 20], "softwar": [30, 35], "solar": 26, "solo": 21, "solut": 26, "somchai": 17, "some": [21, 22, 26, 32, 34], "sornlertlamvanich": 20, "sort": [18, 26], "sound": [12, 26], "sound_syl": 26, "soundex": [30, 32], "sourc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 34, 35], "spa": 9, "space": [5, 16, 20, 21, 25, 26, 33], "space_token": 16, "spaceaft": 15, "spacy_thai": [15, 34], "span": 20, "spars": [2, 25, 28], "speak": 23, "speaker": [1, 24], "spec_add_spac": 25, "special": [1, 11, 21, 23, 24, 25, 26], "specif": [1, 2, 9, 17, 21, 23, 24, 25, 26, 27, 28, 35], "specifi": [9, 13, 16, 19, 20, 22, 27, 34], "speech": [0, 4, 9, 16, 20, 26, 27, 32], "speed": 27, "spell": [1, 3, 12, 21, 24, 26, 30, 34], "spell_syl": 26, "spell_word": 26, "spj": 17, "split": 21, "split_into_sent": 21, "spm": [15, 19, 27], "spooner": [1, 24], "spyll": 18, "src": 26, "src_lang": 23, "ssg": [21, 34], "stabl": 34, "stack": 21, "stackoverflow": 16, "standard": [1, 17, 18, 24, 25, 26], "start": [9, 26, 30], "start_seq": 11, "state": 21, "static": 25, "statist": [5, 17], "stativ": 20, "status": 26, "step": [5, 8, 11, 15, 25], "still": [3, 18, 26], "stop": 19, "stop_word": 19, "stopword": [9, 26], "stopwords_th": 9, "store": 34, "str": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], "str1": 26, "str2": 26, "straightforward": 26, "strategi": [16, 21], "streamlin": 25, "strftime": 26, "string": [1, 9, 11, 13, 17, 21, 24, 25, 26], "string_list": 13, "strip": [9, 12], "strong": 9, "strptime": 26, "strrftime": 26, "structur": [12, 15, 20, 26], "style": 21, "subordin": 20, "subsequ": 26, "subsum": 9, "subtract": [2, 28], "subword": [0, 4, 15, 16, 27, 32, 33], "subword_token": 21, "success": 21, "successfulli": [9, 21], "suffix": 26, "suit": [1, 24], "suitabl": [12, 17, 21, 25, 26, 29, 34], "sum": [19, 25], "summar": [21, 30, 34], "summari": 19, "summat": [2, 28], "sumonma": 26, "suntorntip": 30, "supnithi": 20, "supot": 21, "suppli": [9, 21], "support": [6, 9, 16, 20, 21, 26, 34], "sure": 26, "surfacetext": 9, "suriyawongkul": 30, "suwanvisat": 17, "swe": 9, "syllabl": [9, 12, 15, 21, 26, 32, 34], "syllable_length": 26, "syllable_open_close_detector": 26, "syllable_token": 21, "syllables_th": 9, "symbol": 26, "symposium": 21, "symspel": 18, "symspellpi": [18, 34], "synonym": [0, 4, 9], "synsets1": 9, "synsets2": 9, "system": [1, 8, 17, 24, 26, 33, 34], "syst\u00e8m": 23, "t": [1, 9, 12, 19, 20, 24, 25, 26], "t1": [9, 11], "t10": 9, "t2": 11, "t3": 11, "tab": 9, "tab_fil": 9, "tabl": 20, "tackl": 8, "tag": [0, 4, 9, 10, 15, 16, 21, 27, 30, 32], "tag_provinc": 20, "tagg": 32, "tagger": [15, 16, 21], "taglist": 21, "tailor": 26, "takahashi": 20, "take": [9, 11, 20, 26, 34], "target": [23, 26], "target_lang": 23, "task": [0, 1, 2, 4, 5, 8, 9, 10, 14, 15, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 33], "taxonomi": 9, "tcc": 21, "tcc_p": 21, "tcc_po": 21, "tdtb": 20, "team": 30, "technic": 34, "techniqu": [0, 4, 5, 17, 19, 21, 25, 33], "technologi": [21, 23], "techo": 21, "ted": 21, "ted_crawl": 21, "temperatur": 11, 
"tensor": [25, 26], "term": [2, 9, 12, 28, 33], "terr": 33, "test": [9, 23, 25, 32], "text": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34], "text_1": 21, "text_2": 21, "text_sampl": 7, "text_to_arabic_digit": 26, "text_to_num": 26, "text_to_thai_digit": 26, "textaug": 34, "textbook": [9, 11], "textual": [0, 4, 26], "th": [0, 4, 9, 15, 17, 18, 20, 23, 25, 26], "th2en": 23, "th_blackboard": 15, "th_fr": 23, "th_th": 26, "th_zodiac": 26, "tha": 9, "thai": [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34], "thai2fit": [0, 2, 4, 28, 34], "thai2fit_wv": [2, 28, 32], "thai2rom": [1, 24, 34], "thai2rom_onnx": [1, 24], "thai2transform": 27, "thai2vec": 11, "thai_consonant_to_spel": 26, "thai_dict": 29, "thai_digit": 26, "thai_digit_to_arabic_digit": 26, "thai_nner": 20, "thai_onli": 11, "thai_stopword": 19, "thai_strftim": 26, "thai_strptim": 26, "thai_to_eng": 26, "thai_word": 21, "thai_word_tone_detector": 26, "thaidigit": 26, "thaig2p": [1, 24], "thaig2p_v2": [1, 24], "thailand": [1, 9, 17, 20, 23, 24, 26, 27], "thailand_provinces_th": 9, "thainametagg": [20, 27], "thainer": [16, 20, 27, 32], "thainer14": 20, "thainlp": [9, 32, 34], "thaisentencesegmentor": 21, "thaisum": 21, "thaisumcut": 21, "thaitext": 17, "thaitextaugment": 16, "thaitextprocessor": 16, "thaitoken": 25, "thaiword2vec": [0, 4], "thaiword_to_d": 26, "thaiword_to_num": 26, "thaiword_to_tim": 26, "than": [19, 21], "thanathip": 30, "thatphithakkul": 26, "thc43": [1, 24], "theeramunkong": 21, "thei": [1, 8, 18, 20, 21, 22, 24, 26], "them": [19, 21, 25, 26, 33, 34], "themselv": 26, "thentransl": 23, "theoret": [2, 28], "thepchai": 20, "thesi": [9, 17], "thfr": 23, "thfrtranslat": 23, "thi": [0, 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 33, 34, 35], "thief": 9, "thiev": 9, "thing": 34, "third": [1, 24], "those": 26, "three": [11, 20], "through": [9, 21, 22], "thu": 20, "thwiki_lm": 9, "thwiki_lm_data": 25, "thwiki_model_lstm": 9, "thzh": 23, "thzhtranslat": 23, "ti": 26, "tie_weight": 25, "time": [20, 21, 26, 34], "time_data": 26, "time_to_thaiword": 26, "timeout": 9, "tini": 21, "tis620_to_utf8": 26, "titl": [20, 21], "tk_url": 16, "tltk": [1, 18, 20, 21, 24], "tltk_g2p": [1, 24], "tltk_ipa": [1, 24], "tn": 32, "tnc": [11, 18, 20], "tnc_freq": 9, "to_idna": 26, "to_lunar_d": 26, "todai": 3, "tok": [16, 25], "tok8kicsj": 26, "tok_func": [16, 25], "token": [0, 2, 4, 9, 11, 15, 16, 18, 19, 20, 25, 27, 28, 29, 30, 32, 33, 34], "tokenizer_engin": 19, "tokens_po": 20, "tomorrow": 26, "tonal": [12, 26], "tone": [1, 17, 21, 24, 26], "tone_detector": 26, "tone_to_spel": 26, "tonemark": 26, "too": 12, "tool": [1, 5, 8, 11, 12, 14, 18, 21, 23, 24, 25, 26, 27, 29, 30, 34], "toolkit": [1, 12, 20, 24], "toolset": [0, 4], "top": [2, 11, 26, 28], "top_k": [11, 26], "top_p": 11, "torch": [6, 11, 25], "torch_dtyp": [6, 11], "total": 32, "total_words_in_ref_sampl": 32, "total_words_in_sampl": 32, "tp": 32, "train": [2, 7, 9, 11, 15, 21, 23, 25, 28, 33], "training_data": [7, 9], "transcript": [1, 24, 26], "transform": [25, 26, 27, 29], "transformers_ud": [15, 34], "transformersud": 15, "translat": [12, 26, 29, 30, 33, 34], "transliter": [9, 17, 26, 30, 34], "travel": 26, "treasur": 26, "tree": 15, "treebank": 20, "trie": [9, 21, 26], "trigram": 9, "trn_arg": 25, "trove": 26, "true": [0, 1, 2, 4, 5, 6, 9, 11, 12, 14, 16, 19, 20, 21, 24, 25, 26, 27, 28, 34], 
"trueplookpanya": 14, "truth": 5, "try": [26, 34], "ttc": 11, "ttc_freq": [9, 22], "tud": 20, "tune": [21, 25, 27], "tupl": [0, 2, 4, 7, 9, 13, 16, 18, 19, 20, 26, 27, 28, 29], "tutori": 30, "two": [2, 3, 9, 11, 12, 14, 19, 20, 23, 26, 28, 33, 34], "txt": [5, 9, 21, 22, 25, 32], "txtt": 21, "type": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 34], "typic": 11, "typical_p": 11, "typo": 26, "tzinfo": 26, "u": 26, "uci": 34, "ud": [15, 20], "ud_goeswith": 15, "udom83": 32, "udomcharoenchaikit": 30, "udompanich": 17, "ulmfit": [30, 34], "umlfit": 34, "unabl": 9, "uncas": [19, 27], "under": [19, 34, 35], "underlin": [2, 28], "underscor": 26, "understand": [0, 4, 8, 9, 22, 23, 26, 29, 33], "unexpect": 26, "ungroup": 25, "ungroup_emoji": 25, "unicod": [1, 21, 24, 26, 34], "unicodedata": 26, "unifi": 23, "uniform": 25, "unigram": [9, 19], "union": [9, 10, 12, 15, 16, 20, 21, 26, 27], "uniqu": [0, 1, 4, 21, 24, 26], "unit": [19, 20, 21, 26, 27], "univers": [15, 17, 20, 23, 25, 35], "universaldepend": 20, "unix": 26, "unless": 35, "unnecessari": 25, "unrel": [2, 28], "unreli": 21, "unsupervis": 29, "unwant": 26, "up": [0, 1, 4, 5, 16, 24], "updat": 25, "upgrad": 34, "upo": 15, "upon": [16, 27], "url": [9, 16, 20], "us": [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 35], "usabl": 23, "usag": [0, 4, 6, 25], "use_gpu": 23, "use_mean": [2, 28], "useless": 16, "user": [9, 11, 12, 17, 21, 22, 26, 32], "user1": 32, "usr": [9, 22], "usual": 25, "utenpattanun": 21, "utf": [21, 26], "util": [1, 5, 11, 12, 14, 18, 21, 22, 24, 25, 27, 30, 34], "uua": 26, "v": [0, 4, 9, 21, 26], "v0": [2, 28], "v1": [1, 2, 8, 24, 28], "v2": [1, 15, 20, 24, 27, 29], "v6": 18, "vact": 20, "valid": [12, 18], "valu": [9, 11, 13, 17, 18, 20, 21, 26], "valuabl": [0, 2, 4, 18, 21, 22, 23, 26, 28], "vari": 19, "variabl": [22, 34], "variat": [0, 4, 21], "varieti": [0, 4], "variou": [1, 2, 5, 9, 10, 11, 17, 21, 23, 24, 26, 27, 28], "vatt": 20, "vector": [0, 2, 4, 17, 19, 25, 28, 33, 34], "vehicl": [2, 28], "verb": [9, 15, 16, 20], "verbal": 20, "verif": 12, "verifi": 12, "versatil": [1, 21, 24, 26], "version": [0, 4, 9, 20, 34, 35], "vi": [1, 24], "vichit": 17, "vietnames": [1, 24], "view": 26, "virach": 20, "visibl": 26, "vistec": [21, 23, 27], "visual": [26, 34], "vital": [22, 25, 26], "vocab": 25, "vocabulari": [2, 16, 28], "vocaburai": 21, "vol": 20, "vorapon": 20, "vote": 9, "vowel": [3, 12, 26], "vp": [17, 20], "vsta": [15, 20], "vv": 20, "w": [19, 21, 26, 32], "w1": 14, "w14": [2, 28], "w2": 14, "w2p": [1, 24], "wa": [17, 21, 26], "wai": 11, "wait": 21, "wanchanberta": 18, "wanchanberta_thai_grammarli": 18, "wangchanberta": [0, 4, 16, 19, 20, 21, 30, 34], "wangchanglm": [6, 34], "wannaphong": [1, 21, 24, 30], "wanne": 17, "want": [0, 4, 5, 7, 8, 9, 11, 15, 26], "warn": 26, "warranti": 35, "we": [2, 5, 6, 7, 9, 20, 26, 28, 29, 34], "web": 23, "websit": [23, 30], "weekdai": 26, "weight": [9, 25], "weight_p": 25, "welcom": 25, "well": [9, 16, 25], "western": 26, "wgt": 25, "what": 16, "wheel": 34, "when": [0, 2, 4, 8, 9, 19, 26, 28, 29, 33], "where": [13, 16, 22, 25, 26, 29, 33, 34], "whether": [1, 5, 14, 16, 18, 24], "which": [0, 1, 2, 4, 5, 9, 12, 15, 17, 18, 19, 21, 22, 24, 25, 26, 28], "while": 22, "white": 5, "whitespac": [9, 21, 26], "whl": 34, "who": 26, "whose": 18, "wide": [1, 24], "width": [25, 26], "wiki": 26, "wiki_lm_lstm": 9, "wikidata": 10, "wikiparsec": 9, "wikipedia": [15, 26], "wiktionari": 
[9, 29], "win32": 34, "wind": 9, "window": [2, 21, 28, 34], "wisesight": 32, "within": [0, 4, 8, 10, 16, 18, 20, 21, 22, 25, 27, 29, 34], "without": [1, 18, 20, 21, 24, 25, 26, 35], "wittawat": 21, "word": [0, 1, 2, 3, 4, 5, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 24, 25, 26, 27, 28, 29, 32, 33, 34], "word1": [2, 12, 28], "word2": [2, 12, 28], "word2vec": [0, 2, 4, 28], "word2veckeyedvector": [2, 28], "word_detoken": 21, "word_level": 32, "word_list": 26, "word_rank": 16, "word_token": [0, 4, 5, 16, 19, 21, 25, 33], "word_vector": 30, "wordlist": 9, "wordnet": [0, 4, 34], "words_th": [9, 21], "words_to_num": 26, "wordvector": [2, 28], "work": [0, 2, 4, 9, 12, 16, 17, 18, 21, 22, 25, 26, 28], "workshop": 30, "would": [20, 21, 33], "wrapper": [18, 20, 21, 25], "write": [3, 18, 21, 26, 33, 35], "wrong": 26, "wrongli": 21, "ws1000": 21, "wsd": [30, 34], "wt": [1, 24], "wtp": 21, "wtpsplitax": 21, "wu": 9, "wunsen": [1, 24], "wunsentransliter": [1, 24], "wup": 9, "wv": [2, 28], "www": [2, 9, 14, 17, 18, 20, 21, 26, 28, 34, 35], "x": [5, 20, 26], "x0b": 26, "x0c0123456789": 26, "x1": 7, "xin": [1, 24], "xitgmlwmp": 26, "xl": 19, "xn": 26, "xvae": 20, "xvam": 20, "xvbb": 20, "xvbm": 20, "xvmm": 20, "xx": [20, 23], "xxl": 19, "xxrep": 25, "xxwrep": 25, "xxx": 34, "y": 26, "yamok": 26, "year": [21, 26], "yet": [9, 22, 26], "yoav": [2, 28], "you": [0, 1, 2, 4, 5, 7, 8, 9, 10, 11, 15, 20, 21, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], "your": [0, 1, 2, 4, 9, 21, 24, 28, 29, 30, 34], "yuanghirun": 21, "zero": [25, 26, 35], "zh": [1, 23, 24], "zh_cn": 23, "zh_sandhi": [1, 24], "zh_th": 23, "zhou": 21, "zhth": 23, "zhthtranslat": 23, "zip": 20, "zodiac": 26, "zoneinfo": 26, "zsm": 9, "zvj": 25, "zwj": 25, "zwjp": 26, "zwsp": 26, "\u00e3\u00e3\u00e1": 26, "\u00e3\u00e7": 26, "\u00e3\u00f0": 26, "\u00ed\u00f8\u00b5\u00ea\u00f2\u00eb": 26, "\u0101m\u0101rt": [1, 24], "\u0e01": [1, 3, 6, 7, 9, 11, 17, 18, 19, 20, 21, 24, 26, 29, 32, 33], "\u0e01\u0e01": [9, 29], "\u0e01\u0e01\u0e02\u0e19\u0e32\u0e01": 9, "\u0e01\u0e02\u0e23\u0e27": [1, 24], "\u0e01\u0e02\u0e23\u0e30\u0e20\u0e32\u0e29\u0e32\u0e44\u0e17\u0e22\u0e17": 17, "\u0e01\u0e04": 23, "\u0e01\u0e04\u0e33\u0e1e": 17, "\u0e01\u0e07\u0e32\u0e19\u0e23\u0e32\u0e0a\u0e1a": [1, 24], "\u0e01\u0e07\u0e32\u0e19\u0e40\u0e07": [2, 28], "\u0e01\u0e08\u0e33\u0e04": 32, "\u0e01\u0e14\u0e14": 9, "\u0e01\u0e15": [8, 9], "\u0e01\u0e19": [19, 20], "\u0e01\u0e1b": 32, "\u0e01\u0e1c": 11, "\u0e01\u0e20\u0e32\u0e29\u0e32\u0e16": 33, "\u0e01\u0e20\u0e32\u0e29\u0e32\u0e1a": 21, "\u0e01\u0e21": 20, "\u0e01\u0e23": [8, 9, 10, 19, 20, 32], "\u0e01\u0e23\u0e21\u0e1e\u0e23\u0e30\u0e19\u0e40\u0e23\u0e28\u0e23\u0e27\u0e23\u0e24\u0e17\u0e18": 19, "\u0e01\u0e23\u0e21\u0e1e\u0e23\u0e30\u0e19\u0e40\u0e23\u0e28\u0e27\u0e23\u0e24\u0e17": 19, "\u0e01\u0e23\u0e21\u0e27": 20, "\u0e01\u0e23\u0e23\u0e21\u0e01\u0e32\u0e23\u0e01": 11, "\u0e01\u0e23\u0e2d\u0e1a": [9, 12, 29], "\u0e01\u0e23\u0e30\u0e17": 32, "\u0e01\u0e23\u0e30\u0e17\u0e23\u0e27\u0e07\u0e2d": 26, "\u0e01\u0e23\u0e30\u0e1a": [9, 18], "\u0e01\u0e23\u0e30\u0e1b": 18, "\u0e01\u0e23\u0e32\u0e0a": 22, "\u0e01\u0e25": [7, 16, 20, 26], "\u0e01\u0e25\u0e21": 33, "\u0e01\u0e27": [2, 12, 20, 28], "\u0e01\u0e29": 17, "\u0e01\u0e29\u0e13": 26, "\u0e01\u0e29\u0e23": 26, "\u0e01\u0e29\u0e23\u0e25\u0e30\u0e15": [1, 24], "\u0e01\u0e29\u0e23\u0e2b": 3, "\u0e01\u0e29\u0e23\u0e41\u0e25\u0e30": 26, "\u0e01\u0e29\u0e32\u0e04": 21, "\u0e01\u0e2a\u0e1a\u0e32\u0e22\u0e21\u0e32\u0e01": 8, "\u0e01\u0e2b\u0e23\u0e2d": 19, "\u0e01\u0e2d": 26, 
"\u0e01\u0e30": 32, "\u0e01\u0e30\u0e1b": 18, "\u0e01\u0e32": [2, 9, 16, 21, 25, 26, 28, 32], "\u0e01\u0e32xxrep7": 25, "\u0e01\u0e32\u0e0d\u0e08\u0e19\u0e1a": 9, "\u0e01\u0e32\u0e22\u0e19": 26, "\u0e01\u0e32\u0e23": [2, 9, 20, 21, 26, 28, 32], "\u0e01\u0e32\u0e23\u0e13": [18, 26], "\u0e01\u0e32\u0e23\u0e16": [1, 24], "\u0e01\u0e32\u0e23\u0e17": [1, 24], "\u0e01\u0e32\u0e23\u0e17\u0e33\u0e07\u0e32\u0e19": 32, "\u0e01\u0e32\u0e23\u0e23": 9, "\u0e01\u0e32\u0e23\u0e40\u0e0a": 12, "\u0e01\u0e32\u0e23\u0e40\u0e25": 9, "\u0e01\u0e32\u0e23\u0e41\u0e1e\u0e17\u0e22": 20, "\u0e01\u0e32\u0e23\u0e41\u0e22": 7, "\u0e01\u0e32\u0e23\u0e41\u0e2a\u0e14\u0e07": 21, "\u0e01\u0e32\u0e23\u0e43\u0e0a": 17, "\u0e01\u0e32\u0e25": 26, "\u0e01\u0e32\u0e25\u0e40\u0e27\u0e25\u0e32": 26, "\u0e01\u0e32\u0e2a": 26, "\u0e01\u0e32\u0e2a\u0e32\u0e21\u0e19\u0e32\u0e17": 26, "\u0e01\u0e32\u0e32\u0e32\u0e32\u0e32\u0e32\u0e32": [16, 25], "\u0e01\u0e33\u0e21": 20, "\u0e01\u0e33\u0e25": 20, "\u0e01\u0e33\u0e41\u0e1e\u0e07\u0e40\u0e1e\u0e0a\u0e23": 20, "\u0e01\u0e40\u0e01\u0e13\u0e11": [1, 24], "\u0e01\u0e40\u0e09": 32, "\u0e01\u0e40\u0e14": 19, "\u0e01\u0e40\u0e25": 11, "\u0e01\u0e41\u0e21\u0e27": 23, "\u0e01\u0e42\u0e01\u0e23\u0e18": 7, "\u0e01\u0e43\u0e19\u0e02": 32, "\u0e01\u0e43\u0e2b": 11, "\u0e01\u0e44\u0e17\u0e22": [2, 28], "\u0e02": [7, 9, 20, 21, 26], "\u0e02\u0e13\u0e30": 9, "\u0e02\u0e19\u0e21\u0e0a": 21, "\u0e02\u0e2d\u0e07": [20, 21], "\u0e02\u0e2d\u0e07\u0e01\u0e32\u0e23\u0e1e": 21, "\u0e02\u0e2d\u0e07\u0e40\u0e02\u0e32": 8, "\u0e02\u0e2d\u0e07\u0e40\u0e08": 19, "\u0e02\u0e2d\u0e07\u0e40\u0e23\u0e32\u0e19": 26, "\u0e02\u0e2d\u0e07\u0e41\u0e02": 19, "\u0e02\u0e2d\u0e27": 11, "\u0e02\u0e2d\u0e2d\u0e20": 21, "\u0e02\u0e32": 20, "\u0e04": [11, 14, 17, 18, 20, 21, 26, 29, 32], "\u0e04\u0e04\u0e33\u0e40\u0e2d\u0e01\u0e42\u0e17": 12, "\u0e04\u0e0a\u0e40\u0e2a\u0e19": 19, "\u0e04\u0e0b": 8, "\u0e04\u0e13\u0e30\u0e01\u0e23\u0e23\u0e21\u0e01\u0e32\u0e23\u0e2a": [1, 24], "\u0e04\u0e19": [15, 20, 25, 26], "\u0e04\u0e19\u0e08": 12, "\u0e04\u0e19\u0e14": [15, 26], "\u0e04\u0e19\u0e25\u0e30\u0e04\u0e23": 26, "\u0e04\u0e19\u0e44\u0e17\u0e22": [2, 28], "\u0e04\u0e19\u0e46\u0e19\u0e01": 26, "\u0e04\u0e23": [11, 16, 18, 20, 26], "\u0e04\u0e23\u0e23\u0e32\u0e0a\u0e17": [2, 28], "\u0e04\u0e23\u0e40\u0e23": [0, 4], "\u0e04\u0e25": [18, 20], "\u0e04\u0e25\u0e2d\u0e07": [2, 28], "\u0e04\u0e27\u0e1a\u0e04": 19, "\u0e04\u0e27\u0e23": [20, 32], "\u0e04\u0e27\u0e32\u0e21": [20, 21], "\u0e04\u0e27\u0e32\u0e21\u0e23": 8, "\u0e04\u0e27\u0e32\u0e21\u0e41\u0e1b\u0e25\u0e01\u0e41\u0e22\u0e01\u0e41\u0e25\u0e30": 21, "\u0e04\u0e27\u0e32\u0e21\u0e41\u0e1b\u0e25\u0e01\u0e41\u0e22\u0e01\u0e41\u0e25\u0e30\u0e1e": 21, "\u0e04\u0e2b": 14, "\u0e04\u0e2d": 26, "\u0e04\u0e30": 26, "\u0e04\u0e33": 32, "\u0e04\u0e33\u0e44\u0e17\u0e22\u0e41\u0e17": 14, "\u0e04\u0e40\u0e23": 21, "\u0e07": [0, 4, 7, 9, 11, 12, 17, 18, 19, 20, 21, 26, 32], "\u0e07\u0e01\u0e24\u0e29": [2, 28], "\u0e07\u0e01\u0e32\u0e22\u0e2d\u0e22": 11, "\u0e07\u0e02\u0e19\u0e21\u0e2b\u0e27\u0e32\u0e19\u0e43\u0e19\u0e15\u0e33\u0e19\u0e32\u0e19\u0e17": 19, "\u0e07\u0e04": [3, 14], "\u0e07\u0e04\u0e30\u0e19\u0e2d\u0e07": 12, "\u0e07\u0e04\u0e32\u0e23": 26, "\u0e07\u0e07\u0e32\u0e19\u0e41\u0e25\u0e30\u0e04\u0e27\u0e32\u0e21\u0e23": 19, "\u0e07\u0e08\u0e33\u0e1e\u0e27\u0e01\u0e02\u0e19\u0e21\u0e40\u0e04": 29, "\u0e07\u0e14": 7, "\u0e07\u0e15\u0e23\u0e07\u0e44\u0e1b\u0e15\u0e23\u0e07\u0e21\u0e32": 9, "\u0e07\u0e16\u0e19\u0e19\u0e1e\u0e23\u0e30\u0e2d\u0e32\u0e17": 19, "\u0e07\u0e17\u0e33\u0e15": 11, 
"\u0e07\u0e19": [2, 19, 26, 28], "\u0e07\u0e1a\u0e32\u0e17\u0e16": 26, "\u0e07\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\u0e22": 26, "\u0e07\u0e1b\u0e27\u0e07\u0e21\u0e32": 21, "\u0e07\u0e1e": 26, "\u0e07\u0e1f": 26, "\u0e07\u0e23": 26, "\u0e07\u0e23\u0e16\u0e40\u0e21\u0e25": 26, "\u0e07\u0e23\u0e2d\u0e22\u0e2a": 26, "\u0e07\u0e2a": [0, 4, 20, 32], "\u0e07\u0e2b\u0e21": 32, "\u0e07\u0e2b\u0e21\u0e14": [0, 4, 16, 20, 26], "\u0e07\u0e2b\u0e23": 19, "\u0e07\u0e2b\u0e27": [2, 28], "\u0e07\u0e2b\u0e32\u0e22": 7, "\u0e07\u0e2d\u0e1a\u0e02\u0e19\u0e21\u0e04": 29, "\u0e07\u0e2d\u0e22": 19, "\u0e07\u0e32": 32, "\u0e07\u0e32\u0e19": [26, 29, 32], "\u0e07\u0e40\u0e01\u0e15": 18, "\u0e07\u0e40\u0e04\u0e23\u0e32\u0e30\u0e2b": 21, "\u0e07\u0e40\u0e17\u0e1e": 8, "\u0e07\u0e40\u0e17\u0e1e\u0e21\u0e2b\u0e32\u0e19\u0e04\u0e23": [9, 19], "\u0e07\u0e40\u0e17\u0e1e\u0e40\u0e1b": 10, "\u0e07\u0e40\u0e23": 9, "\u0e07\u0e40\u0e25": 15, "\u0e07\u0e40\u0e28\u0e2a": [2, 28], "\u0e07\u0e40\u0e2a": 12, "\u0e07\u0e40\u0e2d": 12, "\u0e07\u0e41\u0e21": [2, 28], "\u0e07\u0e41\u0e23\u0e01\u0e43\u0e19\u0e01": 22, "\u0e07\u0e41\u0e23\u0e01\u0e43\u0e19\u0e1e": 22, "\u0e07\u0e42\u0e22\u0e04": 14, "\u0e07\u0e43\u0e04\u0e23": 21, "\u0e07\u0e43\u0e2a": 19, "\u0e07\u0e43\u0e2a\u0e40\u0e1b": 19, "\u0e07\u0e44\u0e21": 9, "\u0e07\u0e44\u0e25": 12, "\u0e08": [1, 2, 3, 6, 8, 9, 10, 17, 20, 21, 24, 25, 26, 28, 32], "\u0e08\u0e01\u0e01": 3, "\u0e08\u0e07": 20, "\u0e08\u0e08": [17, 26], "\u0e08\u0e1e\u0e23\u0e30\u0e08": 19, "\u0e08\u0e1e\u0e23\u0e30\u0e08\u0e2d\u0e21\u0e40\u0e01\u0e25": 19, "\u0e08\u0e23": [14, 20], "\u0e08\u0e2d\u0e07": 12, "\u0e08\u0e30": 21, "\u0e08\u0e30\u0e17\u0e33\u0e43\u0e2b": 19, "\u0e08\u0e30\u0e21\u0e32\u0e19": 26, "\u0e08\u0e32\u0e01": [16, 20], "\u0e08\u0e33\u0e19\u0e27\u0e19": [20, 26], "\u0e08\u0e33\u0e25\u0e2d\u0e07": 9, "\u0e09": [0, 4, 7, 12, 16, 20, 21, 23, 32], "\u0e0a": [0, 4, 16, 17, 18, 19, 20, 21, 26, 29], "\u0e0a\u0e01\u0e32\u0e23\u0e1b": 26, "\u0e0a\u0e01\u0e32\u0e25\u0e1b": 26, "\u0e0a\u0e0a": 9, "\u0e0a\u0e22": 17, "\u0e0a\u0e2a\u0e21": 19, "\u0e0a\u0e2d\u0e1a": [0, 4], "\u0e0a\u0e2d\u0e1a\u0e46": 7, "\u0e0a\u0e32\u0e27\u0e0a": 32, "\u0e0a\u0e32\u0e27\u0e44\u0e17\u0e22": [2, 28], "\u0e0b": [1, 9, 19, 20, 24, 26], "\u0e0d": [2, 17, 20, 28], "\u0e0d\u0e04\u0e27\u0e23\u0e2b\u0e25": 11, "\u0e0d\u0e08\u0e23": 14, "\u0e0d\u0e0a": 20, "\u0e0d\u0e0a\u0e32": 18, "\u0e0d\u0e0d\u0e32": [18, 25], "\u0e0d\u0e18\u0e23\u0e23\u0e21": 9, "\u0e0d\u0e28": 9, "\u0e0d\u0e2b\u0e32": 18, "\u0e0d\u0e2b\u0e32\u0e22": 32, "\u0e0d\u0e40\u0e15": 19, "\u0e0d\u0e43\u0e19\u0e01\u0e32\u0e23\u0e2a\u0e23": 9, "\u0e10\u0e18\u0e23\u0e23\u0e21\u0e19": 20, "\u0e10\u0e21\u0e19\u0e15\u0e23": [2, 28], "\u0e10\u0e2d\u0e40\u0e21\u0e23": [2, 28], "\u0e10\u0e32\u0e19": 14, "\u0e12": [2, 21, 28], "\u0e12\u0e19": 9, "\u0e12\u0e19\u0e18\u0e23\u0e23\u0e21": 19, "\u0e12\u0e19\u0e32\u0e01\u0e32\u0e23": 21, "\u0e13": [20, 21, 23], "\u0e13\u0e10\u0e32\u0e19": 14, "\u0e13\u0e11": [1, 24], "\u0e13\u0e19\u0e30": 23, "\u0e13\u0e2b": 17, "\u0e13\u0e32": 20, "\u0e14": [0, 1, 2, 4, 7, 9, 15, 18, 19, 20, 21, 24, 26, 28, 32], "\u0e14\u0e16": 19, "\u0e14\u0e17": 20, "\u0e14\u0e19\u0e32\u0e17": 26, "\u0e14\u0e1a\u0e32\u0e17\u0e16": 26, "\u0e14\u0e1b\u0e01\u0e15": 21, "\u0e14\u0e1b\u0e23\u0e30\u0e0a\u0e32\u0e0a\u0e19\u0e14": 26, "\u0e14\u0e1e\u0e25": 19, "\u0e14\u0e21\u0e1e\u0e32\u0e13": 17, "\u0e14\u0e25": [9, 26], "\u0e14\u0e25\u0e32\u0e19\u0e2a": 26, "\u0e14\u0e2b\u0e19": 29, "\u0e14\u0e2d": [17, 26], "\u0e14\u0e2d\u0e01\u0e44\u0e21": 12, 
"\u0e14\u0e32\u0e27\u0e40\u0e23": 9, "\u0e14\u0e40\u0e01": 26, "\u0e14\u0e41\u0e04\u0e27": [2, 28], "\u0e14\u0e41\u0e25": 19, "\u0e14\u0e42\u0e17\u0e29": 19, "\u0e14\u0e42\u0e17\u0e29\u0e15": 19, "\u0e14\u0e46": 19, "\u0e15": [0, 1, 2, 4, 9, 11, 14, 16, 17, 18, 19, 20, 21, 24, 26, 28, 29, 32], "\u0e15\u0e01\u0e41\u0e15": 9, "\u0e15\u0e21": 25, "\u0e15\u0e22": [19, 26], "\u0e15\u0e22\u0e2a\u0e16\u0e32\u0e19": [1, 24], "\u0e15\u0e22\u0e2a\u0e20\u0e32": [1, 24], "\u0e15\u0e23": [9, 26], "\u0e15\u0e23\u0e1a": 9, "\u0e15\u0e23\u0e20\u0e32\u0e1a": 18, "\u0e15\u0e23\u0e20\u0e32\u0e1e": 18, "\u0e15\u0e23\u0e30\u0e01": 9, "\u0e15\u0e23\u0e40\u0e08": 19, "\u0e15\u0e27\u0e32": 26, "\u0e15\u0e2a\u0e32\u0e2b\u0e01\u0e23\u0e23\u0e21": 26, "\u0e15\u0e2a\u0e33\u0e2b\u0e23": 29, "\u0e15\u0e2d\u0e19": 20, "\u0e15\u0e2d\u0e19\u0e2b": 16, "\u0e15\u0e2d\u0e25\u0e2e": 20, "\u0e15\u0e30\u0e27": 14, "\u0e15\u0e32": 33, "\u0e15\u0e32\u0e01": 33, "\u0e15\u0e33\u0e2b\u0e19": 19, "\u0e15\u0e43\u0e08": 9, "\u0e16": [19, 20, 26, 33], "\u0e16\u0e27": 9, "\u0e16\u0e2d\u0e14\u0e40\u0e2a": [1, 24], "\u0e17": [0, 4, 6, 9, 11, 16, 19, 20, 21, 26, 32], "\u0e17\u0e14\u0e2a\u0e2d\u0e1a": [16, 20], "\u0e17\u0e14\u0e2a\u0e2d\u0e1a\u0e19\u0e32\u0e22\u0e1b\u0e27\u0e23": 16, "\u0e17\u0e14\u0e2a\u0e2d\u0e1a\u0e19\u0e32\u0e22\u0e27\u0e23\u0e23\u0e13\u0e1e\u0e07\u0e29": 20, "\u0e17\u0e14\u0e2a\u0e2d\u0e1a\u0e20\u0e32\u0e29\u0e32\u0e44\u0e17\u0e22": 26, "\u0e17\u0e14\u0e2a\u0e2d\u0e1a\u0e23\u0e30\u0e1a\u0e1a": 23, "\u0e17\u0e14\u0e2a\u0e2d\u0e1a\u0e23\u0e30\u0e1a\u0e1a\u0e40\u0e27\u0e25\u0e32": 20, "\u0e17\u0e17": 20, "\u0e17\u0e18\u0e28": 22, "\u0e17\u0e22\u0e32\u0e19": 17, "\u0e17\u0e22\u0e32\u0e25": [9, 17], "\u0e17\u0e22\u0e32\u0e28\u0e32\u0e2a\u0e15\u0e23": 20, "\u0e17\u0e2d\u0e07": 12, "\u0e17\u0e2d\u0e07\u0e08": 20, "\u0e17\u0e2d\u0e07\u0e2d": 9, "\u0e17\u0e2d\u0e40\u0e23": 19, "\u0e17\u0e30\u0e40\u0e25": [2, 28], "\u0e17\u0e30\u0e40\u0e25\u0e2a\u0e32\u0e1a": [2, 28], "\u0e17\u0e32\u0e07": 20, "\u0e17\u0e32\u0e07\u0e08": 9, "\u0e17\u0e33": 32, "\u0e17\u0e33\u0e01\u0e32\u0e23\u0e28": 21, "\u0e17\u0e33\u0e07\u0e32\u0e19": [9, 20], "\u0e17\u0e33\u0e07\u0e32\u0e19\u0e44\u0e14": 19, "\u0e17\u0e33\u0e2a": 7, "\u0e17\u0e33\u0e40\u0e19": 19, "\u0e17\u0e33\u0e40\u0e1b": 29, "\u0e17\u0e33\u0e43\u0e2b": 19, "\u0e17\u0e41\u0e2d\u0e1b\u0e40\u0e1b": 10, "\u0e18": [1, 11, 19, 24, 26], "\u0e18\u0e07_\u0e44\u0e17\u0e22": 26, "\u0e18\u0e19\u0e32\u0e04\u0e32\u0e23\u0e41\u0e2b": 26, "\u0e18\u0e23\u0e23\u0e21\u0e14\u0e32": 20, "\u0e19": [0, 1, 2, 3, 4, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 28, 32, 33], "\u0e191": 21, "\u0e19\u0e01": [2, 20, 25, 26, 28], "\u0e19\u0e04": 11, "\u0e19\u0e04\u0e19\u0e14": 15, "\u0e19\u0e04\u0e23": 16, "\u0e19\u0e04\u0e27": 21, "\u0e19\u0e04\u0e33\u0e44\u0e17\u0e22\u0e15\u0e32\u0e21\u0e40\u0e2a": 17, "\u0e19\u0e08": 26, "\u0e19\u0e08\u0e33\u0e19\u0e27\u0e19": 26, "\u0e19\u0e0a": [12, 29], "\u0e19\u0e0a\u0e2d\u0e1a\u0e01": 12, "\u0e19\u0e0b": 10, "\u0e19\u0e14": [6, 7, 16, 25, 26], "\u0e19\u0e15\u0e21\u0e22\u0e1b": 25, "\u0e19\u0e15\u0e23\u0e07": 18, "\u0e19\u0e15\u0e23\u0e1a": 18, "\u0e19\u0e17": [19, 20, 21, 26], "\u0e19\u0e17\u0e23": [9, 16, 17, 20, 26], "\u0e19\u0e18": [9, 17, 26], "\u0e19\u0e19": [14, 25], "\u0e19\u0e19\u0e17\u0e1a": 9, "\u0e19\u0e19\u0e22": [1, 24], "\u0e19\u0e1a": [9, 19, 20], "\u0e19\u0e1a\u0e17\u0e04\u0e27\u0e32\u0e21\u0e2b\u0e19": 7, "\u0e19\u0e1a\u0e32\u0e1a": 7, "\u0e19\u0e1b": [19, 26], "\u0e19\u0e1b\u0e23\u0e30\u0e18\u0e32\u0e19\u0e18": 20, 
"\u0e19\u0e1b\u0e23\u0e30\u0e27": [1, 24], "\u0e19\u0e21": 32, "\u0e19\u0e21\u0e23\u0e14\u0e01\u0e15\u0e01\u0e17\u0e2d\u0e14\u0e21\u0e32\u0e16": 19, "\u0e19\u0e21\u0e32\u0e08\u0e32\u0e01\u0e1c\u0e25\u0e07\u0e32\u0e19\u0e27": 21, "\u0e19\u0e22": [9, 26, 32], "\u0e19\u0e22\u0e32\u0e22\u0e19": 21, "\u0e19\u0e23": [7, 23], "\u0e19\u0e23\u0e30\u0e22\u0e30": 21, "\u0e19\u0e23\u0e32": [1, 24], "\u0e19\u0e27\u0e32\u0e04\u0e21": 26, "\u0e19\u0e28": 9, "\u0e19\u0e2a\u0e01": 19, "\u0e19\u0e2a\u0e19\u0e32\u0e19": 20, "\u0e19\u0e2a\u0e32\u0e21\u0e1e": 26, "\u0e19\u0e2b\u0e01\u0e23": 26, "\u0e19\u0e2b\u0e01\u0e23\u0e2d\u0e22\u0e2a": 26, "\u0e19\u0e2b\u0e19": 26, "\u0e19\u0e2b\u0e21\u0e2d\u0e44\u0e14": 7, "\u0e19\u0e2b\u0e23": 19, "\u0e19\u0e2b\u0e32\u0e0a": 17, "\u0e19\u0e2d": [1, 24, 26], "\u0e19\u0e2d\u0e23": 18, "\u0e19\u0e2d\u0e30\u0e44\u0e23\u0e17": 19, "\u0e19\u0e2d\u0e32\u0e01\u0e32\u0e23": 21, "\u0e19\u0e2d\u0e32\u0e01\u0e32\u0e23\u0e1c": 21, "\u0e19\u0e2d\u0e32\u0e17": 26, "\u0e19\u0e2d\u0e32\u0e2b\u0e32\u0e23": 11, "\u0e19\u0e2d\u0e32\u0e2b\u0e32\u0e23\u0e0d": 26, "\u0e19\u0e30": 20, "\u0e19\u0e30\u0e04\u0e23": [7, 26], "\u0e19\u0e32": 21, "\u0e19\u0e32\u0e01\u0e32\u0e23": 21, "\u0e19\u0e32\u0e04\u0e21": [21, 26], "\u0e19\u0e32\u0e07\u0e1b\u0e23\u0e30\u0e19\u0e2d\u0e21": 20, "\u0e19\u0e32\u0e17": 26, "\u0e19\u0e32\u0e19": 25, "\u0e19\u0e32\u0e19\u0e19\u0e32\u0e19\u0e19\u0e32\u0e19": 25, "\u0e19\u0e32\u0e19\u0e32": 26, "\u0e19\u0e32\u0e19\u0e32\u0e32\u0e32": 26, "\u0e19\u0e32\u0e22": [8, 20], "\u0e19\u0e32\u0e22\u0e01": [2, 20, 28], "\u0e19\u0e32\u0e22\u0e01\u0e23": [2, 28], "\u0e19\u0e32\u0e22\u0e19": 26, "\u0e19\u0e32\u0e22\u0e1b\u0e27\u0e23": 16, "\u0e19\u0e32\u0e22\u0e27\u0e23\u0e23\u0e13\u0e1e\u0e07\u0e29": 20, "\u0e19\u0e32\u0e23": [1, 24], "\u0e19\u0e32\u0e2c": 26, "\u0e19\u0e40\u0e0b": 11, "\u0e19\u0e40\u0e14": [2, 28], "\u0e19\u0e40\u0e17\u0e2d\u0e23": [18, 29], "\u0e19\u0e40\u0e1f": 32, "\u0e19\u0e40\u0e21": 10, "\u0e19\u0e40\u0e25": 29, "\u0e19\u0e40\u0e27": 21, "\u0e19\u0e40\u0e2a\u0e32\u0e23": 26, "\u0e19\u0e41\u0e1b\u0e14\u0e23": 26, "\u0e19\u0e41\u0e21\u0e19\u0e14\u0e32\u0e23": [1, 24], "\u0e19\u0e41\u0e2d\u0e25\u0e1e": 26, "\u0e19\u0e42\u0e0b": 21, "\u0e19\u0e42\u0e14\u0e27\u0e2a": 20, "\u0e19\u0e43\u0e08\u0e17": 19, "\u0e19\u0e43\u0e08\u0e2a": 19, "\u0e19\u0e43\u0e19\u0e04\u0e33": 32, "\u0e19\u0e43\u0e19\u0e1e\u0e23\u0e30\u0e1a\u0e32\u0e17\u0e2a\u0e21\u0e40\u0e14": 19, "\u0e19\u0e44\u0e01": 12, "\u0e19\u0e44\u0e1b\u0e1b\u0e23\u0e30\u0e0a": 21, "\u0e1a": [3, 7, 9, 11, 16, 17, 18, 19, 20, 21, 25, 26, 32, 33], "\u0e1a551": 17, "\u0e1a5515": 17, "\u0e1a55150": 17, "\u0e1a931900": 17, "\u0e1ae419": 17, "\u0e1a\u0e01\u0e32\u0e23\u0e2b\u0e21": 21, "\u0e1a\u0e02": 29, "\u0e1a\u0e02\u0e2d\u0e07\u0e1e\u0e23\u0e30\u0e40\u0e08": 19, "\u0e1a\u0e04": [1, 17, 24], "\u0e1a\u0e07\u0e07\u0e04\u0e1a\u0e1a": 3, "\u0e1a\u0e08": 26, "\u0e1a\u0e08\u0e2d\u0e07": 12, "\u0e1a\u0e0a": 9, "\u0e1a\u0e14": 19, "\u0e1a\u0e17\u0e04\u0e27\u0e32\u0e21\u0e19": 21, "\u0e1a\u0e19": [0, 4, 16, 20, 29], "\u0e1a\u0e19\u0e14\u0e27\u0e07\u0e08": 16, "\u0e1a\u0e19\u0e16\u0e19\u0e19\u0e1e\u0e23\u0e30\u0e2d\u0e32\u0e17": 19, "\u0e1a\u0e19\u0e17": 16, "\u0e1a\u0e19\u0e19": [0, 4], "\u0e1a\u0e19\u0e1a\u0e01": [0, 4], "\u0e1a\u0e19\u0e1e": 16, "\u0e1a\u0e19\u0e2a": [0, 4], "\u0e1a\u0e19\u0e2b": [0, 4], "\u0e1a\u0e19\u0e40\u0e02\u0e32\u0e04": 16, "\u0e1a\u0e19\u0e42\u0e25\u0e01\u0e43\u0e1a\u0e19": [0, 4, 16], "\u0e1a\u0e20": 21, "\u0e1a\u0e21\u0e2d\u0e1a\u0e2b\u0e21\u0e32\u0e22\u0e43\u0e2b": 21, "\u0e1a\u0e23": [7, 18], 
"\u0e1a\u0e23\u0e16\u0e41\u0e22": 7, "\u0e1a\u0e25": 26, "\u0e1a\u0e25\u0e23\u0e32\u0e0a\u0e18\u0e32\u0e19": 9, "\u0e1a\u0e27": 8, "\u0e1a\u0e28": [1, 24], "\u0e1a\u0e2a\u0e2d\u0e07": 26, "\u0e1a\u0e2a\u0e2d\u0e07\u0e19\u0e32\u0e2c": 26, "\u0e1a\u0e32\u0e07": 20, "\u0e1a\u0e32\u0e07\u0e01": 8, "\u0e1a\u0e32\u0e07\u0e04\u0e19\u0e16": 32, "\u0e1a\u0e32\u0e17": [20, 21, 26], "\u0e1a\u0e32\u0e23": 20, "\u0e1a\u0e40\u0e01": [26, 29], "\u0e1a\u0e40\u0e08": 26, "\u0e1a\u0e40\u0e0a": 11, "\u0e1a\u0e40\u0e14": 7, "\u0e1a\u0e40\u0e1b\u0e25": 11, "\u0e1a\u0e40\u0e25": 9, "\u0e1a\u0e40\u0e2d": 26, "\u0e1a\u0e42\u0e15": 19, "\u0e1b": [1, 2, 17, 18, 20, 24, 25, 26, 28], "\u0e1b223": 17, "\u0e1b3e54": 17, "\u0e1b775300": 17, "\u0e1b\u0e01\u0e15": 21, "\u0e1b\u0e04\u0e27\u0e32\u0e21\u0e20\u0e32\u0e29\u0e32\u0e44\u0e17\u0e22\u0e42\u0e14\u0e22\u0e43\u0e0a": 9, "\u0e1b\u0e14": 20, "\u0e1b\u0e17\u0e32": 14, "\u0e1b\u0e23": [11, 18], "\u0e1b\u0e23\u0e30\u0e01\u0e32\u0e23\u0e2b\u0e19": 9, "\u0e1b\u0e23\u0e30\u0e08\u0e33\u0e43\u0e19\u0e23\u0e30\u0e14": 21, "\u0e1b\u0e23\u0e30\u0e17": 19, "\u0e1b\u0e23\u0e30\u0e18\u0e32\u0e19\u0e01\u0e23\u0e23\u0e21\u0e01\u0e32\u0e23": [2, 28], "\u0e1b\u0e23\u0e30\u0e18\u0e32\u0e19\u0e32\u0e18": 20, "\u0e1b\u0e23\u0e30\u0e21\u0e32\u0e13": 20, "\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28": [2, 28], "\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e08": [2, 28], "\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e0d": [2, 28], "\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\u0e22": [2, 16, 28], "\u0e1b\u0e23\u0e32\u0e08": 20, "\u0e1b\u0e25\u0e32": 25, "\u0e1b\u0e32\u0e01\u0e41\u0e21": [2, 28], "\u0e1c": [2, 9, 20, 21, 28, 32], "\u0e1c\u0e21": [8, 15, 20, 32], "\u0e1c\u0e21\u0e23": [8, 23], "\u0e1c\u0e21\u0e2a\u0e2d\u0e19": [0, 4], "\u0e1c\u0e21\u0e40\u0e02": [0, 4], "\u0e1c\u0e21\u0e40\u0e1b": 15, "\u0e1c\u0e21\u0e40\u0e23": [0, 4], "\u0e1c\u0e25": [11, 32], "\u0e1d": [2, 20, 28], "\u0e1d\u0e23": [2, 28], "\u0e1d\u0e32": 21, "\u0e1e": [1, 2, 9, 15, 17, 19, 20, 21, 24, 26, 28, 32], "\u0e1e\u0e07\u0e29": 20, "\u0e1e\u0e0d\u0e32\u0e40\u0e08": 19, "\u0e1e\u0e17": [1, 24], "\u0e1e\u0e19": [2, 28], "\u0e1e\u0e19\u0e18": 17, "\u0e1e\u0e21": [2, 28], "\u0e1e\u0e23": [2, 26, 28], "\u0e1e\u0e23\u0e30\u0e2d\u0e07\u0e04": 19, "\u0e1e\u0e23\u0e30\u0e40\u0e08": [9, 19], "\u0e1e\u0e24": 26, "\u0e1e\u0e24\u0e28\u0e08": 26, "\u0e1e\u0e25\u0e40\u0e2d\u0e01": 20, "\u0e1e\u0e27\u0e01": 20, "\u0e1e\u0e27\u0e01\u0e40\u0e23\u0e32": [0, 4, 21], "\u0e1e\u0e27\u0e01\u0e40\u0e23\u0e32\u0e23": 21, "\u0e1e\u0e2d\u0e14": 20, "\u0e1e\u0e30": [1, 24], "\u0e1e\u0e32\u0e01": 32, "\u0e1e\u0e32\u0e19": 16, "\u0e1e\u0e32\u0e19\u0e08\u0e32\u0e01\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\u0e22": 16, "\u0e1e\u0e32\u0e1a": [1, 24], "\u0e1f": [26, 32], "\u0e1f\u0e23\u0e2a\u0e1f": 26, "\u0e1f\u0e40\u0e27\u0e2d\u0e23": 11, "\u0e20": [2, 9, 20, 28], "\u0e20\u0e32": 32, "\u0e20\u0e32\u0e04": 21, "\u0e20\u0e32\u0e1e\u0e22\u0e19\u0e15\u0e23": [1, 24], "\u0e20\u0e32\u0e1e\u0e27\u0e32\u0e14": 21, "\u0e20\u0e32\u0e27\u0e19\u0e32\u0e21\u0e22\u0e1b": 25, "\u0e20\u0e32\u0e29\u0e32": [21, 33], "\u0e20\u0e32\u0e29\u0e32\u0e08": [1, 24], "\u0e20\u0e32\u0e29\u0e32\u0e0d": [1, 24], "\u0e20\u0e32\u0e29\u0e32\u0e16": 33, "\u0e20\u0e32\u0e29\u0e32\u0e44\u0e17\u0e22\u0e1b\u0e23\u0e32\u0e01\u0e0f\u0e04\u0e23": 22, "\u0e21": [7, 8, 9, 11, 12, 17, 18, 19, 20, 21, 26, 32], "\u0e21\u0e01\u0e23\u0e32\u0e04\u0e21": 26, "\u0e21\u0e01\u0e32\u0e23\u0e40\u0e1b\u0e25": 19, 
"\u0e21\u0e0a\u0e19\u0e42\u0e04\u0e01\u0e22\u0e32\u0e27\u0e2b\u0e25\u0e32\u0e22\u0e04\u0e19\u0e44\u0e14": 32, "\u0e21\u0e19": [2, 28], "\u0e21\u0e1b\u0e17\u0e32": 14, "\u0e21\u0e20\u0e32\u0e1e": 26, "\u0e21\u0e22": 26, "\u0e21\u0e2b\u0e32\u0e27": [9, 17], "\u0e21\u0e2d": 26, "\u0e21\u0e30\u0e19\u0e32\u0e27": 18, "\u0e21\u0e30\u0e21": 14, "\u0e21\u0e32": [20, 26], "\u0e21\u0e32\u0e01": [7, 20, 26], "\u0e21\u0e32\u0e01\u0e01": 7, "\u0e21\u0e32\u0e08\u0e32\u0e01": 8, "\u0e21\u0e32\u0e14": [1, 24], "\u0e21\u0e40\u0e02": 19, "\u0e21\u0e40\u0e1b": 19, "\u0e21\u0e40\u0e21": 21, "\u0e21\u0e41\u0e21": [2, 28], "\u0e21\u0e41\u0e23\u0e01": 21, "\u0e21\u0e41\u0e23\u0e01\u0e02\u0e2d\u0e07": 21, "\u0e21\u0e41\u0e23\u0e01\u0e41\u0e25": 8, "\u0e21\u0e41\u0e2d\u0e25\u0e01\u0e2d\u0e2e\u0e2d\u0e25": 11, "\u0e22": [6, 9, 11, 17, 18, 19, 20, 21, 25, 26, 32], "\u0e22\u0e01\u0e2d\u0e32\u0e04\u0e32\u0e23\u0e2b\u0e25": 19, "\u0e22\u0e01\u0e44": 18, "\u0e22\u0e07": [2, 9, 17, 20, 26, 28], "\u0e22\u0e07\u0e1a": 9, "\u0e22\u0e07\u0e1e\u0e2d": 11, "\u0e22\u0e07\u0e20\u0e32\u0e29\u0e32\u0e08": [1, 24], "\u0e22\u0e07\u0e20\u0e32\u0e29\u0e32\u0e44\u0e17\u0e22\u0e40\u0e1b": [1, 24], "\u0e22\u0e07\u0e2a\u0e15\u0e32\u0e07\u0e04": 15, "\u0e22\u0e07\u0e2b\u0e21\u0e32": 12, "\u0e22\u0e07\u0e2d": 17, "\u0e22\u0e07\u0e2d\u0e32\u0e2b\u0e32\u0e23\u0e17": 11, "\u0e22\u0e07\u0e40\u0e2b\u0e21": 17, "\u0e22\u0e0a": 32, "\u0e22\u0e13\u0e23\u0e07\u0e04": 19, "\u0e22\u0e14\u0e15\u0e32\u0e21\u0e19": 7, "\u0e22\u0e14\u0e19\u0e32\u0e21": [2, 28], "\u0e22\u0e17": 21, "\u0e22\u0e18\u0e23\u0e23\u0e21\u0e28\u0e32\u0e2a\u0e15\u0e23": 9, "\u0e22\u0e19": [0, 1, 4, 9, 24], "\u0e22\u0e19\u0e02\u0e2d\u0e07\u0e40\u0e23\u0e32\u0e19": 26, "\u0e22\u0e19\u0e1e\u0e24\u0e15": 11, "\u0e22\u0e19\u0e2a": 21, "\u0e22\u0e19\u0e2b\u0e19": [0, 4], "\u0e22\u0e19\u0e40\u0e1b": 21, "\u0e22\u0e19\u0e41\u0e1b\u0e25\u0e07": 19, "\u0e22\u0e19\u0e41\u0e1b\u0e25\u0e07\u0e15": 19, "\u0e22\u0e1a\u0e17": 19, "\u0e22\u0e1e\u0e23\u0e30\u0e1a\u0e32\u0e17\u0e2a\u0e21\u0e40\u0e14": 19, "\u0e22\u0e23": 12, "\u0e22\u0e27": 20, "\u0e22\u0e27\u0e30\u0e15": 19, "\u0e22\u0e27\u0e40\u0e02": [2, 28], "\u0e22\u0e30": 19, "\u0e22\u0e32\u0e20\u0e23\u0e13": 14, "\u0e22\u0e43\u0e19\u0e04\u0e27\u0e32\u0e21\u0e1a\u0e01\u0e1e\u0e23": 21, "\u0e22\u0e44\u0e1e\u0e1a": 20, "\u0e23": [0, 2, 4, 6, 8, 9, 11, 17, 18, 19, 20, 21, 28, 32, 33], "\u0e23100": 17, "\u0e231000": 17, "\u0e23100000": 17, "\u0e23\u0e13\u0e01\u0e32\u0e23": 17, "\u0e23\u0e14": 17, "\u0e23\u0e16": 17, "\u0e23\u0e16\u0e44\u0e1f": [2, 28], "\u0e23\u0e16\u0e44\u0e1f\u0e1f": [2, 28], "\u0e23\u0e21": 17, "\u0e23\u0e23": [0, 3, 4, 26], "\u0e23\u0e2a": 17, "\u0e23\u0e2d\u0e07\u0e19\u0e32\u0e22\u0e01\u0e23": [2, 28], "\u0e23\u0e2d\u0e07\u0e1b\u0e23\u0e30\u0e18\u0e32\u0e19": [2, 28], "\u0e23\u0e2d\u0e14": 20, "\u0e23\u0e30\u0e0a": 17, "\u0e23\u0e30\u0e1a": 8, "\u0e23\u0e30\u0e1a\u0e1a": 20, "\u0e23\u0e32": [1, 21, 24], "\u0e23\u0e32\u0e0a\u0e1a": [1, 24], "\u0e23\u0e32\u0e0a\u0e27\u0e07\u0e28": 21, "\u0e23\u0e32\u0e22\u0e25\u0e30\u0e40\u0e2d": 7, "\u0e25": [2, 9, 10, 17, 20, 21, 28], "\u0e25100": 17, "\u0e25\u0e02\u0e2d\u0e07\u0e1c": 29, "\u0e25\u0e04\u0e04\u0e19\u0e40\u0e01": 12, "\u0e25\u0e04\u0e0a\u0e40\u0e2a\u0e19": 19, "\u0e25\u0e08\u0e2d\u0e21\u0e40\u0e01\u0e25": 19, "\u0e25\u0e14\u0e19": 11, "\u0e25\u0e1b\u0e27": 19, "\u0e25\u0e21": 33, "\u0e25\u0e22": 20, "\u0e25\u0e2d\u0e07\u0e41\u0e25": 7, "\u0e25\u0e30": 20, "\u0e25\u0e32\u0e22\u0e25": 26, "\u0e25\u0e32\u0e27": [2, 9, 28], "\u0e25\u0e33\u0e19": [2, 28], "\u0e27": [0, 2, 4, 8, 12, 
16, 17, 18, 19, 20, 21, 25, 26, 28, 32], "\u0e27330000": 32, "\u0e274000": 32, "\u0e27\u0e01": 12, "\u0e27\u0e02": 26, "\u0e27\u0e07": 14, "\u0e27\u0e17": 17, "\u0e27\u0e19": [9, 19, 20, 26], "\u0e27\u0e19\u0e17": 19, "\u0e27\u0e19\u0e40\u0e2a": [2, 28], "\u0e27\u0e1a\u0e17": 9, "\u0e27\u0e1e\u0e27\u0e01\u0e21": 7, "\u0e27\u0e22\u0e17\u0e33\u0e43\u0e2b": 19, "\u0e27\u0e22\u0e18\u0e07": 9, "\u0e27\u0e22\u0e2d": [1, 24], "\u0e27\u0e23": 18, "\u0e27\u0e23\u0e23\u0e13": [17, 20, 32], "\u0e27\u0e23\u0e23\u0e13\u0e01\u0e23\u0e23\u0e21": 21, "\u0e27\u0e23\u0e2a\u0e19": 7, "\u0e27\u0e25\u0e30\u0e01": 19, "\u0e27\u0e2d": 26, "\u0e27\u0e2d\u0e1a\u0e43\u0e2b": 29, "\u0e27\u0e32": 21, "\u0e27\u0e40\u0e15\u0e2d\u0e23": 29, "\u0e27\u0e40\u0e1b": 11, "\u0e27\u0e40\u0e2d\u0e07": 20, "\u0e27\u0e42\u0e1b\u0e23\u0e14\u0e40\u0e01\u0e25": 19, "\u0e27\u0e42\u0e21\u0e07": 20, "\u0e27\u0e46": 20, "\u0e28": [1, 2, 9, 16, 19, 20, 21, 24, 26, 28], "\u0e28\u0e23": 19, "\u0e28\u0e32\u0e2a\u0e15\u0e23": [1, 24], "\u0e29": [9, 10, 19], "\u0e29\u0e10\u0e32\u0e19": 14, "\u0e29\u0e20": 9, "\u0e29\u0e41\u0e25\u0e30": 19, "\u0e29\u0e41\u0e25\u0e30\u0e44\u0e21": 19, "\u0e2a": [7, 8, 9, 11, 14, 18, 19, 20, 26, 32], "\u0e2a\u0e14": [6, 9, 11], "\u0e2a\u0e15\u0e32\u0e07\u0e04": 26, "\u0e2a\u0e19": 20, "\u0e2a\u0e19\u0e32\u0e21": 20, "\u0e2a\u0e20\u0e32": [2, 28], "\u0e2a\u0e20\u0e32\u0e1e": 32, "\u0e2a\u0e20\u0e32\u0e1e\u0e01\u0e32\u0e23\u0e08": 32, "\u0e2a\u0e21": [0, 4, 20], "\u0e2a\u0e21\u0e32\u0e0a": [2, 28], "\u0e2a\u0e22": 20, "\u0e2a\u0e23": [14, 19], "\u0e2a\u0e23\u0e23": 12, "\u0e2a\u0e23\u0e23\u0e40\u0e1e\u0e0a\u0e0d": 3, "\u0e2a\u0e23\u0e30": 3, "\u0e2a\u0e27": [6, 9, 11], "\u0e2a\u0e27\u0e22": 20, "\u0e2a\u0e27\u0e22\u0e07\u0e32\u0e21": 12, "\u0e2a\u0e2b\u0e23": [2, 28], "\u0e2a\u0e2d\u0e07": [20, 26], "\u0e2a\u0e2d\u0e07\u0e1e": 26, "\u0e2a\u0e2d\u0e07\u0e23": 26, "\u0e2a\u0e2d\u0e07\u0e25": 26, "\u0e2a\u0e2d\u0e07\u0e42\u0e21\u0e07\u0e40\u0e0a": 26, "\u0e2a\u0e32": [1, 24], "\u0e2a\u0e32\u0e21": 20, "\u0e2a\u0e32\u0e21\u0e32\u0e23\u0e16": [1, 14, 24], "\u0e2a\u0e32\u0e27": 12, "\u0e2a\u0e33\u0e19": [1, 24], "\u0e2a\u0e33\u0e2b\u0e23": 20, "\u0e2b": [1, 19, 20, 21, 24, 26], "\u0e2b\u0e01\u0e42\u0e21\u0e07\u0e04\u0e23": 26, "\u0e2b\u0e19": [1, 20, 24, 25, 26], "\u0e2b\u0e19\u0e2d\u0e07\u0e04\u0e32\u0e22": 20, "\u0e2b\u0e21": [21, 26, 32], "\u0e2b\u0e21\u0e2d": [2, 28], "\u0e2b\u0e21\u0e32": 12, "\u0e2b\u0e21\u0e32\u0e0a": 12, "\u0e2b\u0e21\u0e32\u0e22\u0e16": 19, "\u0e2b\u0e23": [19, 20], "\u0e2b\u0e25": [1, 17, 24], "\u0e2b\u0e25\u0e1a\u0e20": 20, "\u0e2b\u0e25\u0e32\u0e22": 20, "\u0e2b\u0e25\u0e32\u0e22\u0e1b": 32, "\u0e2b\u0e27\u0e32\u0e19": 18, "\u0e2b\u0e32\u0e01": 20, "\u0e2b\u0e32\u0e23": 19, "\u0e2c\u0e32": 20, "\u0e2c\u0e32\u0e25\u0e07\u0e01\u0e23\u0e13": 17, "\u0e2d": [0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 14, 17, 18, 19, 20, 21, 24, 25, 26, 28, 32], "\u0e2d\u0e01\u0e1b\u0e23\u0e30\u0e42\u0e22\u0e04\u0e2a\u0e33\u0e04": 9, "\u0e2d\u0e02\u0e19\u0e21\u0e0a\u0e19": 29, "\u0e2d\u0e02\u0e2d\u0e07\u0e40\u0e2b\u0e25\u0e27": 19, "\u0e2d\u0e04": 17, "\u0e2d\u0e07": [9, 20, 32], "\u0e2d\u0e07\u0e01\u0e07": [2, 28], "\u0e2d\u0e07\u0e01\u0e32\u0e23\u0e04": 29, "\u0e2d\u0e07\u0e08": 16, "\u0e2d\u0e07\u0e08\u0e32\u0e01": 20, "\u0e2d\u0e07\u0e0a": 9, "\u0e2d\u0e07\u0e0a\u0e32\u0e27\u0e1a": 15, "\u0e2d\u0e07\u0e14": 11, "\u0e2d\u0e07\u0e17": [11, 21, 32], "\u0e2d\u0e07\u0e1f": 16, "\u0e2d\u0e07\u0e2b\u0e25\u0e27\u0e07\u0e02\u0e2d\u0e07\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\u0e22": 10, 
"\u0e2d\u0e07\u0e2e\u0e32\u0e40\u0e0b\u0e42\u0e22": [1, 24], "\u0e2d\u0e07\u0e40\u0e18": 12, "\u0e2d\u0e07\u0e40\u0e1e\u0e25\u0e07": 20, "\u0e2d\u0e07\u0e40\u0e2a": 17, "\u0e2d\u0e07\u0e44\u0e21": 19, "\u0e2d\u0e08": 17, "\u0e2d\u0e08\u0e33\u0e19\u0e27\u0e19\u0e17": 26, "\u0e2d\u0e14": [9, 19], "\u0e2d\u0e14\u0e01": 7, "\u0e2d\u0e14\u0e32\u0e27": [2, 28], "\u0e2d\u0e16": [7, 26], "\u0e2d\u0e18\u0e07\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\u0e22": 26, "\u0e2d\u0e19": [2, 7, 12, 20, 28], "\u0e2d\u0e19\u0e01": [12, 17, 20], "\u0e2d\u0e19\u0e1c\u0e21\u0e01": 26, "\u0e2d\u0e19\u0e41\u0e01": 19, "\u0e2d\u0e19\u0e43\u0e2b": 11, "\u0e2d\u0e1a": [20, 32], "\u0e2d\u0e1a\u0e40\u0e04\u0e22\u0e40\u0e1b": 10, "\u0e2d\u0e1b\u0e1b": [1, 24], "\u0e2d\u0e1e": 32, "\u0e2d\u0e21": [11, 29], "\u0e2d\u0e21\u0e32\u0e40\u0e23": 19, "\u0e2d\u0e21\u0e32\u0e43\u0e19\u0e23": 19, "\u0e2d\u0e21\u0e41\u0e0b\u0e21\u0e2a": 19, "\u0e2d\u0e22": [7, 19, 20, 21, 25, 26], "\u0e2d\u0e22\u0e1a\u0e32\u0e17\u0e16": 26, "\u0e2d\u0e22\u0e22\u0e22\u0e22\u0e22\u0e22\u0e22\u0e22": 25, "\u0e2d\u0e22\u0e2a": 26, "\u0e2d\u0e22\u0e40\u0e1b": 11, "\u0e2d\u0e22\u0e41\u0e1b\u0e14\u0e2a": 26, "\u0e2d\u0e22\u0e44\u0e1b": 11, "\u0e2d\u0e23": [7, 19], "\u0e2d\u0e27": [19, 21], "\u0e2d\u0e2a\u0e32\u0e22\u0e21\u0e2d\u0e0d": 19, "\u0e2d\u0e2b\u0e21": 12, "\u0e2d\u0e2d\u0e01\u0e01": 11, "\u0e2d\u0e2d\u0e01\u0e40\u0e2a": 17, "\u0e2d\u0e30": [3, 21], "\u0e2d\u0e30\u0e40\u0e1f\u0e40\u0e0b": 21, "\u0e2d\u0e30\u0e44\u0e23": 20, "\u0e2d\u0e32": [21, 26], "\u0e2d\u0e32\u0e01\u0e32\u0e23": 21, "\u0e2d\u0e32\u0e04\u0e32\u0e23": 20, "\u0e2d\u0e32\u0e04\u0e32\u0e23\u0e40\u0e23": [0, 4], "\u0e2d\u0e32\u0e20\u0e23\u0e13": 25, "\u0e2d\u0e32\u0e2b\u0e32\u0e23": [19, 20], "\u0e2d\u0e32\u0e2b\u0e32\u0e23\u0e08\u0e30\u0e15": 19, "\u0e2d\u0e32\u0e2b\u0e32\u0e23\u0e17": 11, "\u0e2d\u0e32\u0e2b\u0e32\u0e23\u0e17\u0e2d\u0e14": 11, "\u0e2d\u0e32\u0e2b\u0e32\u0e23\u0e21": 11, "\u0e2d\u0e32\u0e2b\u0e32\u0e23\u0e40\u0e0a": [2, 28], "\u0e2d\u0e32\u0e2b\u0e32\u0e23\u0e40\u0e17": [2, 28], "\u0e2d\u0e32\u0e2b\u0e32\u0e23\u0e40\u0e22": [2, 28], "\u0e2d\u0e32\u0e2b\u0e32\u0e23\u0e44\u0e17\u0e22": [2, 28], "\u0e2d\u0e32\u0e40\u0e1a\u0e30": 21, "\u0e2d\u0e40\u0e21\u0e23": [2, 9, 28], "\u0e2d\u0e41\u0e1a\u0e15\u0e2b\u0e21\u0e14": 7, "\u0e2d\u0e41\u0e21": 32, "\u0e2e": [2, 28], "\u0e2f\u0e25\u0e2f": 19, "\u0e30": [16, 20], "\u0e32": [1, 2, 9, 16, 19, 20, 21, 24, 25, 26, 28, 32], "\u0e32\u0e01": 20, "\u0e32\u0e01\u0e24\u0e14\u0e32\u0e20": 19, "\u0e32\u0e01\u0e32\u0e23": [2, 28], "\u0e32\u0e04": 11, "\u0e32\u0e04\u0e23": 16, "\u0e32\u0e07": [19, 20, 32], "\u0e32\u0e07\u0e01": 32, "\u0e32\u0e07\u0e01\u0e27": 21, "\u0e32\u0e07\u0e01\u0e32\u0e22": 19, "\u0e32\u0e07\u0e01\u0e32\u0e22\u0e40\u0e08\u0e23": 19, "\u0e32\u0e07\u0e01\u0e32\u0e22\u0e41\u0e25": 19, "\u0e32\u0e07\u0e02\u0e27\u0e32\u0e07\u0e41\u0e15": 21, "\u0e32\u0e07\u0e04": 11, "\u0e32\u0e07\u0e15\u0e33\u0e2b\u0e19": 19, "\u0e32\u0e07\u0e1b\u0e01\u0e15": 19, "\u0e32\u0e07\u0e21": [0, 4, 16], "\u0e32\u0e07\u0e2a\u0e21": 11, "\u0e32\u0e07\u0e41\u0e25\u0e30\u0e2a\u0e20\u0e32\u0e1e\u0e01\u0e32\u0e23\u0e17\u0e33\u0e07\u0e32\u0e19": 32, "\u0e32\u0e07\u0e43\u0e14": 21, "\u0e32\u0e07\u0e44\u0e23": 20, "\u0e32\u0e07\u0e46": [19, 20], "\u0e32\u0e08\u0e2d\u0e21\u0e21\u0e32\u0e23\u0e14\u0e32": 19, "\u0e32\u0e08\u0e2d\u0e21\u0e21\u0e32\u0e23\u0e14\u0e32\u0e01\u0e25": 19, "\u0e32\u0e0a": 19, "\u0e32\u0e15\u0e32\u0e25\u0e2a": 11, "\u0e32\u0e17\u0e2d\u0e07": 12, "\u0e32\u0e19": [9, 16, 17, 20, 25], "\u0e32\u0e19\u0e01\u0e33\u0e25": 15, 
"\u0e32\u0e19\u0e02\u0e2d\u0e07\u0e40\u0e08": 19, "\u0e32\u0e19\u0e19\u0e19\u0e19\u0e19": 25, "\u0e32\u0e19\u0e1a\u0e32\u0e17": 20, "\u0e32\u0e19\u0e21\u0e32": 32, "\u0e32\u0e19\u0e2a": 26, "\u0e32\u0e19\u0e2a\u0e2d\u0e07\u0e2b\u0e21": 26, "\u0e32\u0e19\u0e2a\u0e32\u0e21\u0e41\u0e2a\u0e19\u0e2b\u0e01\u0e23": 26, "\u0e32\u0e19\u0e40\u0e01": 21, "\u0e32\u0e1a\u0e23\u0e21\u0e27\u0e07\u0e28": 19, "\u0e32\u0e1a\u0e32\u0e17": 26, "\u0e32\u0e1a\u0e32\u0e17\u0e19": 26, "\u0e32\u0e1e": 19, "\u0e32\u0e1e\u0e23\u0e30\u0e22\u0e32\u0e21\u0e2b\u0e32\u0e42\u0e22\u0e18\u0e32": 19, "\u0e32\u0e1e\u0e23\u0e30\u0e22\u0e32\u0e21\u0e2b\u0e32\u0e42\u0e22\u0e18\u0e32\u0e19\u0e23\u0e32\u0e18": 19, "\u0e32\u0e21": 20, "\u0e32\u0e22": [2, 20, 26, 28], "\u0e32\u0e22\u0e17\u0e2d\u0e14\u0e40\u0e2a": [1, 24], "\u0e32\u0e22\u0e2d\u0e2d\u0e01": 32, "\u0e32\u0e22\u0e42\u0e21\u0e07\u0e04\u0e23": 26, "\u0e32\u0e23": [18, 26], "\u0e32\u0e23\u0e32\u0e0a\u0e01\u0e32\u0e23\u0e08": [2, 28], "\u0e32\u0e23\u0e32\u0e0a\u0e01\u0e32\u0e23\u0e44\u0e14": 21, "\u0e32\u0e25": 11, "\u0e32\u0e27": [1, 24], "\u0e32\u0e27\u0e2d\u0e22\u0e32\u0e01\u0e01": 26, "\u0e32\u0e2a": [19, 26], "\u0e32\u0e2b": 26, "\u0e32\u0e2b\u0e19": 11, "\u0e32\u0e2b\u0e23": 32, "\u0e32\u0e2d": 20, "\u0e32\u0e2d\u0e22": [11, 19, 20, 26], "\u0e32\u0e2d\u0e30\u0e44\u0e23": 16, "\u0e32\u0e2f": 19, "\u0e32\u0e32": 7, "\u0e32\u0e40\u0e08": 19, "\u0e32\u0e40\u0e1b": 19, "\u0e32\u0e40\u0e23": [0, 4], "\u0e32\u0e40\u0e2a\u0e21\u0e2d": 11, "\u0e32\u0e41\u0e02": 19, "\u0e32\u0e42\u0e21\u0e07\u0e40\u0e0a": [16, 20], "\u0e32\u0e43\u0e2b\u0e21": 21, "\u0e32\u0e46": 20, "\u0e33": [2, 16, 17, 25, 28], "\u0e33\u0e2b\u0e19": 11, "\u0e33\u0e2d\u0e22": 21, "\u0e33\u0e40\u0e2a\u0e21\u0e2d": 20, "\u0e33\u0e41\u0e02": 19, "\u0e40\u0e01": [19, 20, 21, 26], "\u0e40\u0e01\u0e13\u0e11": [1, 24], "\u0e40\u0e01\u0e2d\u0e27": 12, "\u0e40\u0e01\u0e32\u0e2b\u0e25": [2, 28], "\u0e40\u0e02": [8, 20, 21], "\u0e40\u0e02\u0e15\u0e1e\u0e23\u0e30\u0e19\u0e04\u0e23": 19, "\u0e40\u0e02\u0e32": [0, 2, 4, 20, 28], "\u0e40\u0e02\u0e32\u0e01\u0e33\u0e25": 29, "\u0e40\u0e02\u0e32\u0e0a": 12, "\u0e40\u0e04": 33, "\u0e40\u0e04\u0e22": 20, "\u0e40\u0e04\u0e22\u0e17\u0e33\u0e21\u0e32\u0e43\u0e19\u0e2d\u0e14": 21, "\u0e40\u0e04\u0e2d\u0e23": 32, "\u0e40\u0e07": 21, "\u0e40\u0e08": 19, "\u0e40\u0e08\u0e23": [17, 19], "\u0e40\u0e0a": [11, 19, 20], "\u0e40\u0e0a\u0e2d\u0e23": 20, "\u0e40\u0e14": [16, 18, 19], "\u0e40\u0e15": [9, 25], "\u0e40\u0e16\u0e2d\u0e30": 20, "\u0e40\u0e17": 20, "\u0e40\u0e18\u0e2d": [19, 20], "\u0e40\u0e18\u0e2d\u0e20": 7, "\u0e40\u0e19": [18, 20, 29], "\u0e40\u0e19\u0e2d\u0e23": [2, 28], "\u0e40\u0e1a\u0e30": 21, "\u0e40\u0e1b": [7, 15, 19, 20, 21, 26], "\u0e40\u0e1c": 32, "\u0e40\u0e1c\u0e0a": 20, "\u0e40\u0e1e": [9, 11, 17, 20], "\u0e40\u0e1e\u0e22\u0e19": 18, "\u0e40\u0e1e\u0e23\u0e32\u0e30\u0e27": [19, 20], "\u0e40\u0e1e\u0e25": 18, "\u0e40\u0e1e\u0e25\u0e07": 18, "\u0e40\u0e1f\u0e40\u0e0b": 21, "\u0e40\u0e21": [9, 20, 26], "\u0e40\u0e21\u0e19": [2, 28], "\u0e40\u0e23": [0, 2, 4, 9, 12, 16, 20, 21, 28], "\u0e40\u0e23\u0e32": [0, 4, 20, 21, 33], "\u0e40\u0e23\u0e32\u0e0a\u0e2d\u0e1a\u0e44\u0e1b\u0e42\u0e23\u0e07\u0e40\u0e23": [0, 4], "\u0e40\u0e23\u0e32\u0e23": 33, "\u0e40\u0e23\u0e32\u0e40\u0e25": 21, "\u0e40\u0e25": [20, 21], "\u0e40\u0e25\u0e02": 26, "\u0e40\u0e25\u0e02\u0e32\u0e18": [2, 28], "\u0e40\u0e25\u0e22": 20, "\u0e40\u0e25\u0e22\u0e04": 7, "\u0e40\u0e27": [2, 28, 29], "\u0e40\u0e27\u0e25\u0e32": [20, 26], "\u0e40\u0e27\u0e25\u0e32\u0e21": 7, "\u0e40\u0e28\u0e29": 20, 
"\u0e40\u0e2a": [2, 9, 18, 20, 26, 28, 32], "\u0e40\u0e2a\u0e17\u0e2d\u0e46": 20, "\u0e40\u0e2a\u0e32": 19, "\u0e40\u0e2a\u0e32\u0e44\u0e1f\u0e1f": 19, "\u0e40\u0e2b": [1, 20, 24], "\u0e40\u0e2b\u0e15": [18, 26, 32], "\u0e40\u0e2b\u0e15\u0e01\u0e32\u0e23\u0e13": 18, "\u0e40\u0e2b\u0e21": [12, 20, 26], "\u0e40\u0e2b\u0e23\u0e2d": 20, "\u0e40\u0e2d": [12, 20], "\u0e40\u0e2d\u0e01": 26, "\u0e40\u0e2d\u0e01\u0e2d": [2, 28], "\u0e40\u0e2d\u0e07": 12, "\u0e40\u0e2d\u0e19\u0e40\u0e19": 9, "\u0e40\u0e2d\u0e2d": [12, 20], "\u0e40\u0e2d\u0e32": 13, "\u0e40\u0e40\u0e1b\u0e25\u0e01": 26, "\u0e41": 21, "\u0e41\u0e01": 20, "\u0e41\u0e04\u0e19\u0e32\u0e14\u0e32": 9, "\u0e41\u0e04\u0e1b": 20, "\u0e41\u0e04\u0e25\u0e2d\u0e23": 11, "\u0e41\u0e15": [9, 20, 29], "\u0e41\u0e19": 19, "\u0e41\u0e1a\u0e19": 29, "\u0e41\u0e1a\u0e1a": [13, 20], "\u0e41\u0e1a\u0e1a\u0e08\u0e33\u0e25\u0e2d\u0e07\u0e41\u0e1a\u0e1a\u0e25\u0e33\u0e14": 9, "\u0e41\u0e1b": 21, "\u0e41\u0e1b\u0e14": 26, "\u0e41\u0e1b\u0e14\u0e19\u0e32\u0e2c": 26, "\u0e41\u0e1b\u0e14\u0e42\u0e21\u0e07\u0e2a": 26, "\u0e41\u0e1b\u0e25\u0e01": [21, 26], "\u0e41\u0e21": [2, 20, 21, 28], "\u0e41\u0e21\u0e27": [11, 12, 16, 20, 25, 26], "\u0e41\u0e21\u0e27\u0e17\u0e33\u0e2d\u0e30\u0e44\u0e23\u0e15\u0e2d\u0e19\u0e2b": [16, 20], "\u0e41\u0e21\u0e27\u0e40\u0e27\u0e25\u0e32\u0e19\u0e30\u0e19": 11, "\u0e41\u0e21\u0e27\u0e44\u0e21": 11, "\u0e41\u0e22": 21, "\u0e41\u0e22\u0e01": 21, "\u0e41\u0e23": 21, "\u0e41\u0e23\u0e07": [9, 18], "\u0e41\u0e23\u0e07\u0e07\u0e32\u0e19": 32, "\u0e41\u0e23\u0e07\u0e07\u0e32\u0e19\u0e01\u0e30\u0e14": 32, "\u0e41\u0e25": [12, 29], "\u0e41\u0e25\u0e30": [17, 20, 21, 26, 32], "\u0e41\u0e25\u0e30\u0e01\u0e32\u0e23\u0e41\u0e2a\u0e14\u0e07\u0e07": 21, "\u0e41\u0e25\u0e30\u0e14": 7, "\u0e41\u0e25\u0e30\u0e1e": 11, "\u0e41\u0e25\u0e30\u0e40\u0e02\u0e32": 8, "\u0e41\u0e25\u0e30\u0e40\u0e02\u0e32\u0e44\u0e14": 21, "\u0e41\u0e25\u0e30\u0e40\u0e04\u0e23": 11, "\u0e41\u0e25\u0e30\u0e40\u0e08": 19, "\u0e41\u0e25\u0e30\u0e40\u0e1b": 19, "\u0e41\u0e25\u0e30\u0e44\u0e14": 21, "\u0e41\u0e2a\u0e19\u0e2a\u0e14\u0e43\u0e2a": 12, "\u0e41\u0e2a\u0e19\u0e41\u0e1b\u0e14\u0e2b\u0e21": 26, "\u0e41\u0e2b\u0e25": [2, 28], "\u0e41\u0e2d": 26, "\u0e41\u0e2d\u0e1a": 18, "\u0e41\u0e2d\u0e25\u0e01\u0e2d\u0e2e\u0e2d\u0e25": 9, "\u0e41\u0e2d\u0e25\u0e08": 9, "\u0e42\u0e01_\u0e40\u0e27\u0e2a\u0e1b": 9, "\u0e42\u0e04": 20, "\u0e42\u0e04\u0e42\u0e23\u0e19": 20, "\u0e42\u0e0b": 21, "\u0e42\u0e14\u0e22\u0e19": 9, "\u0e42\u0e14\u0e22\u0e1b\u0e01\u0e15": 20, "\u0e42\u0e14\u0e22\u0e40\u0e23": 20, "\u0e42\u0e17": 26, "\u0e42\u0e19": 20, "\u0e42\u0e1b\u0e23\u0e14": 9, "\u0e42\u0e1b\u0e23\u0e41\u0e01\u0e23\u0e21\u0e01\u0e32\u0e23\u0e2a": 17, "\u0e42\u0e1b\u0e23\u0e41\u0e01\u0e23\u0e21\u0e04\u0e2d\u0e21\u0e1e": 29, "\u0e42\u0e1e\u0e18": 16, "\u0e42\u0e21": 9, "\u0e42\u0e21\u0e07": 20, "\u0e42\u0e21\u0e19": 9, "\u0e42\u0e21\u0e1a": 9, "\u0e42\u0e22\u0e04": 14, "\u0e42\u0e23\u0e04\u0e23\u0e30\u0e1a\u0e32\u0e14": 20, "\u0e42\u0e23\u0e07\u0e40\u0e23": [0, 4, 26], "\u0e42\u0e23\u0e07\u0e41\u0e23\u0e21\u0e02\u0e2d\u0e07\u0e40\u0e23\u0e32\u0e19": 26, "\u0e42\u0e23\u0e21\u0e32\u0e40\u0e19": 9, "\u0e42\u0e25\u0e01\u0e23": 20, "\u0e42\u0e25\u0e40\u0e21\u0e15\u0e23": 20, "\u0e42\u0e2d": [9, 20, 33], "\u0e42\u0e2d\u0e1a\u0e23": 10, "\u0e42\u0e2d\u0e1a\u0e32\u0e21\u0e32": 20, "\u0e42\u0e2d\u0e1a\u0e32\u0e21\u0e32\u0e40\u0e1b": 20, "\u0e42\u0e2d\u0e1e": 20, "\u0e42\u0e2d\u0e27\u0e32\u0e19\u0e19": 9, "\u0e42\u0e2d\u0e2e\u0e32\u0e42\u0e22": [1, 24], "\u0e42\u0e2d\u0e30\u0e2e\u0e30\u0e42\u0e22": [1, 
24], "\u0e42\u0e2d\u0e40\u0e04": [21, 33], "\u0e42\u0e2d\u0e40\u0e04\u0e1a": [21, 33], "\u0e43\u0e01\u0e25": 26, "\u0e43\u0e04\u0e23": 20, "\u0e43\u0e08": 7, "\u0e43\u0e08\u0e01": 7, "\u0e43\u0e08\u0e17": 7, "\u0e43\u0e0a": [12, 20, 26, 29], "\u0e43\u0e15": [2, 20, 28], "\u0e43\u0e19": [20, 26, 32], "\u0e43\u0e19\u0e01\u0e32\u0e23\u0e17\u0e33\u0e07\u0e32\u0e19": 29, "\u0e43\u0e19\u0e17\u0e32\u0e07\u0e2d": 29, "\u0e43\u0e19\u0e1b": [2, 28], "\u0e43\u0e19\u0e23": [19, 26], "\u0e43\u0e19\u0e2a\u0e27\u0e19": 15, "\u0e43\u0e2b": [19, 20], "\u0e43\u0e2b\u0e21": [20, 25], "\u0e44": 21, "\u0e44\u0e01": 26, "\u0e44\u0e07": [7, 9], "\u0e44\u0e0b\u0e19": [2, 28], "\u0e44\u0e14": [6, 7, 8, 11, 12, 20, 21, 26], "\u0e44\u0e17\u0e22": [1, 2, 20, 24, 28], "\u0e44\u0e17\u0e22\u0e40\u0e2d": 26, "\u0e44\u0e17\u0e22\u0e43\u0e19\u0e40\u0e2d\u0e01\u0e2a\u0e32\u0e23": [1, 24], "\u0e44\u0e1b": [0, 4, 9, 20], "\u0e44\u0e1b\u0e22": [0, 4], "\u0e44\u0e1e\u0e1a": 20, "\u0e44\u0e1f\u0e1f": 19, "\u0e44\u0e1f\u0e25": 18, "\u0e44\u0e21": [9, 20, 21, 26, 32], "\u0e44\u0e21\u0e40\u0e04": 20, "\u0e44\u0e2b\u0e19": [13, 20, 21], "\u0e44\u0e2b\u0e21": 20, "\u0e46": [7, 19, 26, 29], "\u0e50": 26, "\u0e51": [9, 26], "\u0e51\u0e50": 26, "\u0e51\u0e52\u0e53": 26, "\u0e52": 26, "\u0e52\u0e55": 26, "\u0e53": 26, "\u0e54": 26, "\u0e54\u0e50\u0e50": 26, "\u0e55": 26, "\u0e55\u0e59": 26, "\u0e58": 26, "\u0e59": 26, "\u6211\u7231\u4f60": 23}, "titles": ["pythainlp.augment", "pythainlp.transliterate", "pythainlp.word_vector", "pythainlp.ancient", "pythainlp.augment", "pythainlp.benchmarks", "pythainlp.chat", "pythainlp.classify", "pythainlp.coref", "pythainlp.corpus", "pythainlp.el", "pythainlp.generate", "pythainlp.khavee", "pythainlp.lm", "pythainlp.morpheme", "pythainlp.parse", "pythainlp.phayathaibert", "pythainlp.soundex", "pythainlp.spell", "pythainlp.summarize", "pythainlp.tag", "pythainlp.tokenize", "pythainlp.tools", "pythainlp.translate", "pythainlp.transliterate", "pythainlp.ulmfit", "pythainlp.util", "pythainlp.wangchanberta", "pythainlp.word_vector", "pythainlp.wsd", "PyThaiNLP documentation", "FAQ", "Command Line", "Getting Started", "Installation", "License"], "titleterms": {"addit": [0, 4], "all_lemma_nam": 9, "all_synset": 9, "ancient": 3, "audio_vector": 17, "augment": [0, 4], "benchmark": 5, "bigram": 11, "bigram_word_freq": 9, "bpembaug": [0, 4], "chat": 6, "citat": 30, "class": [0, 4], "classifi": 7, "command": 32, "conceptnet": 9, "configur": 34, "coref": 8, "corefer": 8, "corpu": 9, "correct": 18, "correct_s": 18, "countri": 9, "custom_lemma": 9, "default_spell_check": 18, "definit": 9, "depend": [2, 28], "dependency_pars": 15, "document": 30, "download": 9, "edg": 9, "el": 10, "engin": [1, 19, 20, 21, 24], "entitylink": 10, "evalu": 5, "exampl": [10, 11, 12, 15], "extract": 19, "faq": [31, 34], "fasttextaug": [0, 4], "find_badword": 9, "find_synonym": 9, "function": [0, 4, 5, 8], "gen_sent": 11, "gener": 11, "get": 33, "get_corpu": 9, "get_corpus_as_i": 9, "get_corpus_db": 9, "get_corpus_db_detail": 9, "get_corpus_default_db": 9, "get_corpus_path": 9, "get_transliteration_dict": 9, "indic": 30, "instal": 34, "introduct": [0, 4, 5, 8], "keybert": 19, "keyword": 19, "khave": 12, "khaveeverifi": 12, "lang": 9, "lch_similar": 9, "lemma": 9, "lemma_from_kei": 9, "level": 21, "licens": 35, "line": 32, "lk82": 17, "lm": 13, "ltw2vaug": [0, 4], "metasound": 17, "modul": [1, 2, 3, 9, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], "morphem": 14, "morphi": 9, "norvigspellcheck": 18, "note": 30, 
"notebook": 33, "oscar": 9, "packag": 30, "pars": 15, "path_similar": 9, "perceptron": 20, "phayathaibert": 16, "prayut_and_somchaip": 17, "provinc": 9, "pythainlp": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], "qualiti": 5, "refer": [1, 2, 17, 18, 20, 24, 26, 27, 28, 30], "remov": 9, "resolut": 8, "revise_newmm_default_wordset": 9, "revise_wordset": 9, "runtim": 34, "sentenc": 21, "sound": 17, "soundex": 17, "spell": 18, "spell_sent": 18, "start": 33, "subword": 21, "summar": 19, "synset": 9, "tabl": 30, "tag": 20, "tagger": 20, "textaug": [], "th_en_translit": 9, "thai2fit": 11, "thai2fitaug": [0, 4], "thai2transformersaug": [0, 4], "thai_dict": 9, "thai_family_nam": 9, "thai_female_nam": 9, "thai_male_nam": 9, "thai_neg": 9, "thai_orst_word": 9, "thai_stopword": 9, "thai_syl": 9, "thai_synonym": 9, "thai_word": 9, "thai_wsd_dict": 9, "tnc": 9, "token": [5, 21], "tool": 22, "translat": 23, "transliter": [1, 24], "trigram": 11, "trigram_word_freq": 9, "ttc": 9, "tutori": 33, "udom83": 17, "ulmfit": 25, "unigram": [11, 20], "unigram_word_freq": 9, "usag": [5, 8, 11, 15], "util": [9, 26], "wangchanberta": 27, "wangchanglm": 11, "word": 21, "word2audio": 17, "word2vecaug": [0, 4], "word_approxim": 17, "word_freq": 9, "word_vector": [2, 28], "wordnet": 9, "wordnetaug": [0, 4], "wsd": 29, "wup_similar": 9}}) \ No newline at end of file diff --git a/index.html b/index.html index a9c8c76..de58b4d 100644 --- a/index.html +++ b/index.html @@ -31,13 +31,13 @@

PyThaiNLP

-

+

PyThaiNLP Documentation

Documentation

Old Documentation
@@ -56,11 +56,12 @@

PyThaiNLP Documentation

PyThaiNLP 3.0
PyThaiNLP 3.1
PyThaiNLP 4.0
+ PyThaiNLP 5.0
-

© PyThaiNLP 2018 - 2024

+

© PyThaiNLP 2016 - 2025

diff --git a/js/old_docs.js b/js/old_docs.js index 8643fff..9e20ed3 100644 --- a/js/old_docs.js +++ b/js/old_docs.js @@ -1,7 +1,7 @@ var options = { - text: "Warning! You are reading an older version of this document. To read documentation for the current version, click here.!", + text: "Warning! You are reading an older version of this document. To read documentation for the current version, click here!", duration: -1, - destination: "https://pythainlp.github.io/docs/5.0/", + destination: "https://pythainlp.github.io/docs/5.1/", callback: function() { // console.log("Toast hidden"); Toastify.reposition();