Commit

Merge pull request #706 from PyThaiNLP/add-dependency-parser
Add pythainlp.parse.dependency_parsing
wannaphong committed Sep 17, 2022
2 parents 638c28d + e9b5ffb commit e2a3404
Showing 10 changed files with 268 additions and 4 deletions.
5 changes: 4 additions & 1 deletion docker_requirements.txt
@@ -12,7 +12,7 @@ sentencepiece==0.1.91
ssg==0.0.8
torch==1.8.1
fastai==1.0.61
transformers==4.8.2
transformers==4.22.1
phunspell==0.1.6
spylls==0.1.5
symspellpy==6.7.6
@@ -31,3 +31,6 @@ thai-nner==0.3
spacy==2.3.*
wunsen==0.0.3
khanaa==0.0.6
spacy_thai==0.7.1
esupar==1.3.8
ufal.chu-liu-edmonds==1.0.2
10 changes: 10 additions & 0 deletions docs/api/parse.rst
@@ -0,0 +1,10 @@
.. currentmodule:: pythainlp.parse

pythainlp.parse
===============
The :mod:`pythainlp.parse` module provides dependency parsing for Thai.

Modules
-------

.. autofunction:: dependency_parsing
6 changes: 5 additions & 1 deletion docs/notes/installation.rst
@@ -31,7 +31,11 @@ where ``extras`` can be
- ``tltk`` (to support tltk)
- ``textaugment`` (to support text augmentation)
- ``oskut`` (to support OSKUT)
- ``nlpo3`` (to support nlpo3 enging)
- ``nlpo3`` (to support nlpo3 engine)
- ``spacy_thai`` (to support spacy_thai engine)
- ``esupar`` (to support esupar engine)
- ``transformers_ud`` (to support transformers_ud engine)
- ``dependency_parsing`` (to support dependency parsing with all engines)
- ``full`` (install everything)

For dependency details, look at `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.
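A quick usage sketch (not part of this diff): it assumes PyThaiNLP is installed with the ``dependency_parsing`` extra listed above and calls the function added in ``pythainlp/parse/core.py`` below; the model name is one of the esupar options documented there::

    # pip install pythainlp[dependency_parsing]  (installs all parsing engines)
    from pythainlp.parse import dependency_parsing

    # Default engine (esupar) with the default Thai model
    print(dependency_parsing("ผมเป็นคนดี"))

    # Explicit engine and model selection
    print(
        dependency_parsing(
            "ผมเป็นคนดี",
            model="KoichiYasuoka/deberta-base-thai-upos",
            engine="esupar",
        )
    )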
8 changes: 8 additions & 0 deletions pythainlp/parse/__init__.py
@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
"""
PyThaiNLP Parse
"""
__all__ = [
    "dependency_parsing"
]
from pythainlp.parse.core import dependency_parsing
92 changes: 92 additions & 0 deletions pythainlp/parse/core.py
@@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-
_tagger = None
_tagger_name = ""

def dependency_parsing(text: str, model: str = None, engine: str = "esupar") -> str:
    """
    Dependency Parsing

    :param str text: text to do dependency parsing
    :param str model: model to use with the engine \
        (for esupar and transformers_ud)
    :param str engine: the name of the dependency parser
    :return: the dependency parse in CoNLL-U format
    :rtype: str

    **Options for engine**
        * *esupar* (default) - Tokenizer, POS-tagger, and dependency-parser \
            with BERT/RoBERTa/DeBERTa models. `GitHub \
            <https://github.com/KoichiYasuoka/esupar>`_
        * *spacy_thai* - Tokenizer, POS-tagger, and dependency-parser \
            for the Thai language, working on Universal Dependencies. \
            `GitHub <https://github.com/KoichiYasuoka/spacy-thai>`_
        * *transformers_ud* - TransformersUD \
            `GitHub <https://github.com/KoichiYasuoka/>`_

    **Options for model (esupar engine)**
        * *th* (default) - KoichiYasuoka/roberta-base-thai-spm-upos model \
            `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-upos>`_
        * *KoichiYasuoka/deberta-base-thai-upos* - DeBERTa(V2) model \
            pre-trained on Thai Wikipedia texts for POS-tagging and \
            dependency-parsing `Huggingface \
            <https://huggingface.co/KoichiYasuoka/deberta-base-thai-upos>`_
        * *KoichiYasuoka/roberta-base-thai-syllable-upos* - RoBERTa model \
            pre-trained on Thai Wikipedia texts for POS-tagging and \
            dependency-parsing (syllable level) `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-syllable-upos>`_
        * *KoichiYasuoka/roberta-base-thai-char-upos* - RoBERTa model \
            pre-trained on Thai Wikipedia texts for POS-tagging \
            and dependency-parsing (char level) `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-char-upos>`_

    If you want to train a model for esupar, see the \
    `esupar repository on GitHub <https://github.com/KoichiYasuoka/esupar>`_.

    **Options for model (transformers_ud engine)**
        * *KoichiYasuoka/deberta-base-thai-ud-head* (default) - \
            DeBERTa(V2) model pretrained on Thai Wikipedia texts \
            for dependency-parsing (head-detection on Universal \
            Dependencies) as question-answering, derived from \
            deberta-base-thai and trained on th_blackboard.conll. \
            `Huggingface \
            <https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head>`_
        * *KoichiYasuoka/roberta-base-thai-spm-ud-head* - \
            RoBERTa model pretrained on Thai Wikipedia texts \
            for dependency-parsing. `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-ud-head>`_

    :Example:
    ::

        from pythainlp.parse import dependency_parsing

        print(dependency_parsing("ผมเป็นคนดี", engine="esupar"))
        # output:
        # 1 ผม _ PRON _ _ 3 nsubj _ SpaceAfter=No
        # 2 เป็น _ VERB _ _ 3 cop _ SpaceAfter=No
        # 3 คน _ NOUN _ _ 0 root _ SpaceAfter=No
        # 4 ดี _ VERB _ _ 3 acl _ SpaceAfter=No

        print(dependency_parsing("ผมเป็นคนดี", engine="spacy_thai"))
        # output:
        # 1 ผม PRON PPRS _ 2 nsubj _ SpaceAfter=No
        # 2 เป็น VERB VSTA _ 0 ROOT _ SpaceAfter=No
        # 3 คนดี NOUN NCMN _ 2 obj _ SpaceAfter=No
    """
    global _tagger, _tagger_name
    # Load the requested parser lazily and cache it; it is reloaded only
    # when the engine changes.
    if _tagger_name != engine:
        if engine == "esupar":
            from pythainlp.parse.esupar_engine import Parse
            _tagger = Parse(model=model)
        elif engine == "transformers_ud":
            from pythainlp.parse.transformers_ud import Parse
            _tagger = Parse(model=model)
        elif engine == "spacy_thai":
            from pythainlp.parse.spacy_thai_engine import Parse
            _tagger = Parse()
        else:
            raise NotImplementedError(
                f"The engine '{engine}' is not supported."
            )
        _tagger_name = engine
    return _tagger(text)
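Because ``dependency_parsing`` returns CoNLL-U text, a short sketch of consuming that output may help; ``conllu_rows`` is a hypothetical helper (not part of this diff), and ``str()`` is used defensively in case an engine returns an object that only renders as CoNLL-U::

    from pythainlp.parse import dependency_parsing

    def conllu_rows(conllu: str):
        """Hypothetical helper: split CoNLL-U text into per-token columns."""
        return [
            line.split("\t")
            for line in conllu.strip().splitlines()
            if line and not line.startswith("#")
        ]

    parsed = str(dependency_parsing("ผมเป็นคนดี", engine="esupar"))
    for row in conllu_rows(parsed):
        # Columns: ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC
        print(row[0], row[1], row[6], row[7])  # e.g. "1 ผม 3 nsubj"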
17 changes: 17 additions & 0 deletions pythainlp/parse/esupar_engine.py
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
"""
esupar: Tokenizer, POS-tagger, and dependency-parser with BERT/RoBERTa/DeBERTa models for Japanese and other languages
GitHub: https://github.com/KoichiYasuoka/esupar
"""
import esupar


class Parse:
    def __init__(self, model: str = "th") -> None:
        if model is None:
            model = "th"
        self.nlp = esupar.load(model)

    def __call__(self, text):
        return self.nlp(text)
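For reference, a minimal sketch of using this wrapper directly (it is normally reached through ``pythainlp.parse.dependency_parsing``); ``th`` is the default model listed in the core.py docstring::

    from pythainlp.parse.esupar_engine import Parse

    parser = Parse(model="th")    # any esupar model listed in core.py also works
    print(parser("ผมเป็นคนดี"))     # prints the analysis in the CoNLL-U style shown above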
19 changes: 19 additions & 0 deletions pythainlp/parse/spacy_thai_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
"""
spacy_thai: Tokenizer, POS-tagger, and dependency-parser for Thai language, working on Universal Dependencies.
GitHub: https://github.com/KoichiYasuoka/spacy-thai
"""
import spacy_thai


class Parse:
    def __init__(self, model: str = "th") -> None:
        # spacy_thai ships a single Thai pipeline; the model argument is kept
        # only for interface compatibility with the other engines.
        self.nlp = spacy_thai.load()

    def __call__(self, text: str) -> str:
        doc = self.nlp(text)
        _text = []
        for t in doc:
            # One CoNLL-U line per token:
            # ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC
            _text.append("\t".join([
                str(t.i + 1), t.orth_, t.lemma_, t.pos_, t.tag_, "_",
                str(0 if t.head == t else t.head.i + 1), t.dep_, "_",
                "_" if t.whitespace_ else "SpaceAfter=No",
            ]))
        return "\n".join(_text)
81 changes: 81 additions & 0 deletions pythainlp/parse/transformers_ud.py
@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
"""
TransformersUD
Author: Prof. Koichi Yasuoka
This tagger is provided under the terms of the Apache-2.0 License.
The source: https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head
GitHub: https://github.com/KoichiYasuoka
"""
import os
import numpy
import torch
import ufal.chu_liu_edmonds
from transformers import (
AutoTokenizer,
AutoModelForQuestionAnswering,
AutoModelForTokenClassification,
AutoConfig,
TokenClassificationPipeline
)
from transformers.utils import cached_file


class Parse:
    def __init__(self, model: str = "KoichiYasuoka/deberta-base-thai-ud-head") -> None:
        if model is None:
            model = "KoichiYasuoka/deberta-base-thai-ud-head"
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model)
        x = AutoModelForTokenClassification.from_pretrained
        if os.path.isdir(model):
            d, t = x(os.path.join(model, "deprel")), x(os.path.join(model, "tagger"))
        else:
            c = AutoConfig.from_pretrained(cached_file(model, "deprel/config.json"))
            d = x(cached_file(model, "deprel/pytorch_model.bin"), config=c)
            s = AutoConfig.from_pretrained(cached_file(model, "tagger/config.json"))
            t = x(cached_file(model, "tagger/pytorch_model.bin"), config=s)
        self.deprel = TokenClassificationPipeline(
            model=d,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple",
        )
        self.tagger = TokenClassificationPipeline(
            model=t,
            tokenizer=self.tokenizer,
        )

    def __call__(self, text: str) -> str:
        # Word spans and dependency relations from the "deprel" pipeline,
        # POS/feature tags (keyed by character offset) from the "tagger" pipeline.
        w = [(t["start"], t["end"], t["entity_group"]) for t in self.deprel(text)]
        z, n = {t["start"]: t["entity"].split("|") for t in self.tagger(text)}, len(w)
        r, m = [text[s:e] for s, e, p in w], numpy.full((n + 1, n + 1), numpy.nan)
        v, c = self.tokenizer(r, add_special_tokens=False)["input_ids"], []
        # Build one question-answering input per word, masking that word,
        # to score every candidate head position.
        for i, t in enumerate(v):
            q = [self.tokenizer.cls_token_id] + t + [self.tokenizer.sep_token_id]
            c.append([q] + v[0:i] + [[self.tokenizer.mask_token_id]] + v[i + 1:] + [[q[-1]]])
        b = [[len(sum(x[0:j + 1], [])) for j in range(len(x))] for x in c]
        with torch.no_grad():
            d = self.model(
                input_ids=torch.tensor([sum(x, []) for x in c]),
                token_type_ids=torch.tensor([[0] * x[0] + [1] * (x[-1] - x[0]) for x in b]),
            )
        s, e = d.start_logits.tolist(), d.end_logits.tolist()
        for i in range(n):
            for j in range(n):
                m[i + 1, 0 if i == j else j + 1] = s[i][b[i][j]] + e[i][b[i][j + 1] - 1]
        # Decode the highest-scoring dependency tree with Chu-Liu/Edmonds,
        # forcing a single root if the first decoding produces several.
        h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
        if [0 for i in h if i == 0] != [0]:
            i = ([p for s, e, p in w] + ["root"]).index("root")
            j = i + 1 if i < n else numpy.nanargmax(m[:, 0])
            m[0:j, 0] = m[j + 1:, 0] = numpy.nan
            h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
        # Emit one CoNLL-U line per word.
        u = ""
        for i, (s, e, p) in enumerate(w, 1):
            p = "root" if h[i] == 0 else "dep" if p == "root" else p
            u += "\t".join(
                [str(i), r[i - 1], "_", z[s][0][2:], "_", "|".join(z[s][1:]),
                 str(h[i]), p, "_",
                 "_" if i < n and e < w[i][0] else "SpaceAfter=No"]
            ) + "\n"
        return u + "\n"
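A short sketch of exercising this engine through the public API (not part of this diff); on first use the default KoichiYasuoka/deberta-base-thai-ud-head model is fetched from the Hugging Face Hub::

    from pythainlp.parse import dependency_parsing

    # Heads and relations are decoded with Chu-Liu/Edmonds, one CoNLL-U line per word.
    print(dependency_parsing("ผมเป็นคนดี", engine="transformers_ud"))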
23 changes: 21 additions & 2 deletions setup.py
@@ -81,6 +81,22 @@
"onnxruntime>=1.10.0"
],
"thai_nner": ["thai_nner"],
"esupar": [
"esupar>=1.3.8",
"numpy",
"transformers>=4.22.1",
],
"spacy_thai": ["spacy_thai>=0.7.1"],
"transformers_ud": [
"ufal.chu-liu-edmonds>=1.0.2",
"transformers>=4.22.1",
],
"dependency_parsing": [
"esupar>=1.3.8",
"spacy_thai>=0.7.1",
"ufal.chu-liu-edmonds>=1.0.2",
"transformers>=4.22.1",
],
"full": [
"PyYAML>=5.3.1",
"attacut>=1.0.4",
@@ -98,7 +114,7 @@
"torch>=1.0.0",
"fastai<2.0",
"bpemb>=0.3.2",
"transformers>=4.6.0",
"transformers>=4.22.1",
"sefr_cut>=1.1",
"phunspell>=0.1.6",
"spylls>=0.1.5",
@@ -108,7 +124,10 @@
"nlpo3>=1.2.2",
"onnxruntime>=1.10.0",
"thai_nner",
"wunsen>=0.0.3"
"wunsen>=0.0.3",
"spacy_thai>=0.7.1",
"esupar>=1.3.8",
"ufal.chu-liu-edmonds>=1.0.2",
],
}

11 changes: 11 additions & 0 deletions tests/test_parse.py
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

import unittest
from pythainlp.parse import dependency_parsing


class TestParsePackage(unittest.TestCase):
    def test_dependency_parsing(self):
        self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="esupar"))
        self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="transformers_ud"))
        self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="spacy_thai"))
