Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions docs/api/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ Modules
TNC
---

.. autofunction:: pythainlp.corpus.tnc.word_freq
.. autofunction:: pythainlp.corpus.tnc.word_freqs

TTC
Expand Down Expand Up @@ -51,4 +50,4 @@ Definition
++++++++++

Synset
a set of synonyms that share a common meaning.
a set of synonyms that share a common meaning.
52 changes: 1 addition & 51 deletions pythainlp/corpus/tnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,64 +5,14 @@
Credit: Korakot Chaovavanich
https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
"""
import re
from typing import List, Tuple

import requests
from pythainlp.corpus import get_corpus

# Names exported by `from pythainlp.corpus.tnc import *`.
# NOTE(review): `__all__` is assigned twice in a row; the later assignment
# wins, so only `word_freqs` ends up exported even though `word_freq` is
# still defined below. This looks like the before/after lines of a diff --
# confirm which single assignment is intended and drop the other.
__all__ = ["word_freq", "word_freqs"]
__all__ = ["word_freqs"]

# File name of the TNC frequency list.
# Presumably loaded through `get_corpus` by `word_freqs()`; the use site is
# not visible in this view -- verify.
_FILENAME = "tnc_freq.txt"


def word_freq(word: str, domain: str = "all") -> int:
    """
    Get the frequency of a word in the Thai National Corpus (TNC) by domain.

    .. note::
        **Not officially supported.**
        This function makes a query to the server of
        Thai National Corpus.
        Internet connection is required.

    .. warning::
        As of 29 April 2019 this function was documented as likely to
        return 0 regardless of the word, because the service had changed.
        The query is sent to http://www.arts.chula.ac.th/~ling/tnc3/ and
        the count is scraped from the returned HTML page, so whenever the
        expected markup is not found the result is 0.

    :param str word: word to look up
    :param str domain: one of ``"all"``, ``"imaginative"``,
        ``"natural-pure-science"``, ``"applied-science"``,
        ``"social-science"``, ``"world-affairs-history"``,
        ``"commerce-finance"``, ``"arts"``, ``"belief-thought"``,
        ``"leisure"`` or ``"others"``
    :return: the total count extracted from the response page,
        or 0 if no count could be extracted
    :rtype: int
    :raises KeyError: if `domain` is not one of the names listed above
    """
    # Form value the TNC search page expects for each domain name.
    domain_codes = {
        "all": "",
        "imaginative": "1",
        "natural-pure-science": "2",
        "applied-science": "3",
        "social-science": "4",
        "world-affairs-history": "5",
        "commerce-finance": "6",
        "arts": "7",
        "belief-thought": "8",
        "leisure": "9",
        "others": "0",
    }
    # Resolve the domain first so an unknown name fails before any
    # network traffic is generated.
    domain_code = domain_codes[domain]

    url = "http://www.arts.chula.ac.th/~ling/tnc3/"
    payload = {
        "genre[]": "",
        "domain[]": domain_code,
        "sortby": "perc",
        "p": word,
    }

    # Without an explicit timeout, requests may wait forever on a server
    # that accepts the connection but never answers.
    response = requests.post(url, data=payload, timeout=10)

    # The count is the text of the white-coloured cell that follows the
    # "TOTAL" label; DOTALL lets the lazy match span line breaks.
    match = re.search(
        r'TOTAL</font>.*?#ffffff">(.*?)</font>',
        response.text,
        flags=re.DOTALL,
    )
    if not match:
        return 0

    # Tolerate thousands separators in the scraped figure.
    count_text = match.group(1).strip().replace(",", "")
    try:
        return int(count_text)
    except ValueError:
        # Unexpected cell content: stay best-effort, like the no-match case.
        return 0


def word_freqs() -> List[Tuple[str, int]]:
"""
Get word frequency from Thai National Corpus (TNC)
Expand Down
1 change: 0 additions & 1 deletion tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ def test_corpus(self):

def test_tnc(self):
    """Both TNC frequency look-ups must return something other than None."""
    all_frequencies = tnc.word_freqs()
    self.assertIsNotNone(all_frequencies)

    single_frequency = tnc.word_freq("นก")
    self.assertIsNotNone(single_frequency)

def test_ttc(self):
self.assertIsNotNone(ttc.word_freqs())
Expand Down