-
Notifications
You must be signed in to change notification settings - Fork 271
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #706 from PyThaiNLP/add-dependency-parser
Add pythainlp.parse.dependency_parsing
- Loading branch information
Showing
10 changed files
with
268 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
.. currentmodule:: pythainlp.parse | ||
|
||
pythainlp.parse | ||
=============== | ||
The :mod:`pythainlp.parse` module provides dependency parsing for Thai text. | ||
|
||
Modules | ||
------- | ||
|
||
.. autofunction:: dependency_parsing |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# -*- coding: utf-8 -*- | ||
"""
PyThaiNLP Parse

Package entry point for Thai dependency parsing. The public function
:func:`dependency_parsing` is imported from ``pythainlp.parse.core``.
"""
# Names exported by ``from pythainlp.parse import *``.
__all__ = [
    "dependency_parsing"
]
from pythainlp.parse.core import dependency_parsing |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# -*- coding: utf-8 -*- | ||
# Module-level cache of the most recently built parser, so that repeated
# calls with the same settings do not reload the underlying model.
_tagger = None
_tagger_name = ""
_tagger_model = None


def dependency_parsing(
    text: str, model: str = None, engine: str = "esupar"
) -> str:
    """
    Dependency Parsing

    The parser object is cached at module level. It is rebuilt whenever
    the requested ``engine`` or ``model`` differs from the cached one.

    :param str text: text to do dependency parsing
    :param str model: name of the model to use with the engine \
        (for esupar and transformers_ud; ignored by spacy_thai). \
        ``None`` selects the default model of the engine.
    :param str engine: the name of the dependency parser
    :return: the parse in CoNLL-U format, as produced by the engine
    :rtype: str
    :raises NotImplementedError: if ``engine`` is not a supported name

    **Options for engine**
        * *esupar* (default) - Tokenizer POS-tagger and Dependency-parser \
            with BERT/RoBERTa/DeBERTa model. `GitHub \
            <https://github.com/KoichiYasuoka/esupar>`_
        * *spacy_thai* - Tokenizer, POS-tagger, and dependency-parser \
            for Thai language, working on Universal Dependencies. \
            `GitHub <https://github.com/KoichiYasuoka/spacy-thai>`_
        * *transformers_ud* - TransformersUD \
            `GitHub <https://github.com/KoichiYasuoka/>`_

    **Options for model (esupar engine)**
        * *th* (default) - KoichiYasuoka/roberta-base-thai-spm-upos model \
            `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-upos>`_
        * *KoichiYasuoka/deberta-base-thai-upos* - DeBERTa(V2) model \
            pre-trained on Thai Wikipedia texts for POS-tagging and \
            dependency-parsing `Huggingface \
            <https://huggingface.co/KoichiYasuoka/deberta-base-thai-upos>`_
        * *KoichiYasuoka/roberta-base-thai-syllable-upos* - RoBERTa model \
            pre-trained on Thai Wikipedia texts for POS-tagging and \
            dependency-parsing. (syllable level) `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-syllable-upos>`_
        * *KoichiYasuoka/roberta-base-thai-char-upos* - RoBERTa model \
            pre-trained on Thai Wikipedia texts for POS-tagging \
            and dependency-parsing. (char level) `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-char-upos>`_

    If you want to train a model for esupar, see the esupar \
    `GitHub <https://github.com/KoichiYasuoka/esupar>`_ repository.

    **Options for model (transformers_ud engine)**
        * *KoichiYasuoka/deberta-base-thai-ud-head* (default) - \
            DeBERTa(V2) model pretrained on Thai Wikipedia texts \
            for dependency-parsing (head-detection on Universal \
            Dependencies) as question-answering, derived from \
            deberta-base-thai. \
            trained by th_blackboard.conll. `Huggingface \
            <https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head>`_
        * *KoichiYasuoka/roberta-base-thai-spm-ud-head* - \
            roberta model pretrained on Thai Wikipedia texts \
            for dependency-parsing. `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-ud-head>`_

    :Example:
    ::

        from pythainlp.parse import dependency_parsing

        print(dependency_parsing("ผมเป็นคนดี", engine="esupar"))
        # output:
        # 1 ผม _ PRON _ _ 3 nsubj _ SpaceAfter=No
        # 2 เป็น _ VERB _ _ 3 cop _ SpaceAfter=No
        # 3 คน _ NOUN _ _ 0 root _ SpaceAfter=No
        # 4 ดี _ VERB _ _ 3 acl _ SpaceAfter=No

        print(dependency_parsing("ผมเป็นคนดี", engine="spacy_thai"))
        # output:
        # 1 ผม PRON PPRS _ 2 nsubj _ SpaceAfter=No
        # 2 เป็น VERB VSTA _ 0 ROOT _ SpaceAfter=No
        # 3 คนดี NOUN NCMN _ 2 obj _ SpaceAfter=No
    """
    global _tagger, _tagger_name, _tagger_model

    # spacy_thai takes no model argument, so the model must not be part of
    # its cache key (otherwise a different ``model`` would force a reload).
    model_key = None if engine == "spacy_thai" else model

    # Rebuild when either the engine or the model changed; comparing the
    # engine alone would keep serving a previously loaded model.
    if _tagger_name != engine or _tagger_model != model_key:
        # Engine modules are imported lazily so that only the dependencies
        # of the engine actually requested need to be installed.
        if engine == "esupar":
            from pythainlp.parse.esupar_engine import Parse
            _tagger = Parse(model=model)
        elif engine == "transformers_ud":
            from pythainlp.parse.transformers_ud import Parse
            _tagger = Parse(model=model)
        elif engine == "spacy_thai":
            from pythainlp.parse.spacy_thai_engine import Parse
            _tagger = Parse()
        else:
            raise NotImplementedError(
                "The engine doesn't support."
            )
        # Only record the new key once the parser was built successfully.
        _tagger_name = engine
        _tagger_model = model_key
    return _tagger(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
esupar: Tokenizer POS-tagger and Dependency-parser with BERT/RoBERTa/DeBERTa models for Japanese and other languages | ||
GitHub: https://github.com/KoichiYasuoka/esupar | ||
""" | ||
import esupar | ||
|
||
|
||
class Parse:
    """Callable wrapper around an esupar pipeline."""

    def __init__(self, model: str = "th") -> None:
        """
        Load the esupar pipeline.

        :param str model: model name passed to ``esupar.load``;
            ``None`` falls back to ``"th"``.
        """
        # Callers may pass None explicitly to mean "use the default".
        if model is None:
            model = "th"
        self.nlp = esupar.load(model)

    def __call__(self, text: str) -> str:
        """
        Parse ``text`` and return the result as a string.

        :param str text: text to parse
        :return: string form of the document returned by the pipeline
        :rtype: str
        """
        # The pipeline returns a document object; convert it so that this
        # engine returns ``str`` like the other parsing engines do.
        return str(self.nlp(text))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
spacy_thai: Tokenizer, POS-tagger, and dependency-parser for Thai language, working on Universal Dependencies. | ||
GitHub: https://github.com/KoichiYasuoka/spacy-thai | ||
""" | ||
import spacy_thai | ||
|
||
|
||
class Parse:
    """Callable wrapper that renders a spacy_thai parse as CoNLL-U lines."""

    def __init__(self, model: str = "th") -> None:
        """
        Load the spacy_thai pipeline.

        :param str model: unused; ``spacy_thai.load()`` is called without
            arguments. Kept so the signature matches the other engines.
        """
        self.nlp = spacy_thai.load()

    def __call__(self, text: str) -> str:
        """
        Parse ``text`` and return one tab-separated line per token.

        The ten columns are: 1-based token id, form, lemma, ``pos_``,
        ``tag_``, ``_``, 1-based head id (``0`` when the token is its own
        head), ``dep_``, ``_`` and either ``_`` or ``SpaceAfter=No``.

        :param str text: text to parse
        :return: token lines joined with newlines, without a trailing
            newline; an empty string if the document has no tokens
        :rtype: str
        """
        doc = self.nlp(text)
        rows = []
        for token in doc:
            # A token that is its own head is the root (head id 0).
            head_id = 0 if token.head == token else token.head.i + 1
            rows.append("\t".join([
                str(token.i + 1),
                token.orth_,
                # CoNLL-U does not allow empty fields; an attribute that is
                # an empty string is written as the underscore placeholder.
                token.lemma_ or "_",
                token.pos_ or "_",
                token.tag_ or "_",
                "_",
                str(head_id),
                token.dep_ or "_",
                "_",
                "_" if token.whitespace_ else "SpaceAfter=No",
            ]))
        return "\n".join(rows)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
TransformersUD | ||
Author: Prof. Koichi Yasuoka | ||
This tagger is provided under the terms of the apache-2.0 License. | ||
The source: https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head | ||
GitHub: https://github.com/KoichiYasuoka | ||
""" | ||
import os | ||
import numpy | ||
import torch | ||
import ufal.chu_liu_edmonds | ||
from transformers import ( | ||
AutoTokenizer, | ||
AutoModelForQuestionAnswering, | ||
AutoModelForTokenClassification, | ||
AutoConfig, | ||
TokenClassificationPipeline | ||
) | ||
from transformers.utils import cached_file | ||
|
||
|
||
class Parse:
    """
    Dependency parser built from a TransformersUD head-detection model.

    Holds a tokenizer, a question-answering model used to score candidate
    heads, and two token-classification pipelines: ``deprel`` (relation
    labels, aggregated into spans) and ``tagger`` (per-token tags).
    """

    def __init__(
        self, model: str = "KoichiYasuoka/deberta-base-thai-ud-head"
    ) -> None:
        """
        Load the tokenizer and the three models.

        :param str model: local directory or model name; ``None`` falls
            back to ``"KoichiYasuoka/deberta-base-thai-ud-head"``.
        """
        # Callers may pass None explicitly to mean "use the default".
        if model is None:
            model = "KoichiYasuoka/deberta-base-thai-ud-head"
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model)
        load = AutoModelForTokenClassification.from_pretrained
        if os.path.isdir(model):
            # Local checkout: the sub-models live in sub-directories.
            deprel_model = load(os.path.join(model, "deprel"))
            tagger_model = load(os.path.join(model, "tagger"))
        else:
            # Otherwise resolve each sub-model's files through cached_file.
            deprel_config = AutoConfig.from_pretrained(
                cached_file(model, "deprel/config.json")
            )
            deprel_model = load(
                cached_file(model, "deprel/pytorch_model.bin"),
                config=deprel_config,
            )
            tagger_config = AutoConfig.from_pretrained(
                cached_file(model, "tagger/config.json")
            )
            tagger_model = load(
                cached_file(model, "tagger/pytorch_model.bin"),
                config=tagger_config,
            )
        self.deprel = TokenClassificationPipeline(
            model=deprel_model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple"
        )
        self.tagger = TokenClassificationPipeline(
            model=tagger_model,
            tokenizer=self.tokenizer
        )

    def __call__(self, text: str) -> str:
        """
        Parse ``text`` and return ten-column, tab-separated token lines.

        :param str text: text to parse
        :return: one line per word, each ending in a newline, followed by
            one extra newline
        :rtype: str
        """
        # Word spans with their relation label, from the deprel pipeline.
        spans = [
            (t["start"], t["end"], t["entity_group"])
            for t in self.deprel(text)
        ]
        # Tag pieces per start offset, split on "|".
        tags = {
            t["start"]: t["entity"].split("|") for t in self.tagger(text)
        }
        n = len(spans)
        forms = [text[s:e] for s, e, p in spans]
        # scores[d, h]: score of word d taking head h; column 0 is the root.
        scores = numpy.full((n + 1, n + 1), numpy.nan)
        word_ids = self.tokenizer(
            forms, add_special_tokens=False
        )["input_ids"]

        # One input row per word: the word itself wrapped in CLS/SEP,
        # then the whole sentence with that word replaced by a mask token,
        # then a closing SEP.
        rows = []
        for i, ids in enumerate(word_ids):
            question = (
                [self.tokenizer.cls_token_id]
                + ids
                + [self.tokenizer.sep_token_id]
            )
            rows.append(
                [question]
                + word_ids[0:i]
                + [[self.tokenizer.mask_token_id]]
                + word_ids[i + 1:]
                + [[question[-1]]]
            )
        # Cumulative token counts after each segment of each row.
        bounds = [
            [len(sum(row[0:j + 1], [])) for j in range(len(row))]
            for row in rows
        ]
        with torch.no_grad():
            output = self.model(
                input_ids=torch.tensor([sum(row, []) for row in rows]),
                token_type_ids=torch.tensor(
                    [[0] * b[0] + [1] * (b[-1] - b[0]) for b in bounds]
                )
            )
        start_logits = output.start_logits.tolist()
        end_logits = output.end_logits.tolist()

        # Score for word i pointing at word j: start logit at the first
        # token of segment j plus end logit at its last token. The masked
        # position (j == i) is stored in the root column.
        for i in range(n):
            for j in range(n):
                scores[i + 1, 0 if i == j else j + 1] = (
                    start_logits[i][bounds[i][j]]
                    + end_logits[i][bounds[i][j + 1] - 1]
                )
        heads = ufal.chu_liu_edmonds.chu_liu_edmonds(scores)[0]

        # If the result does not contain exactly one root, keep a single
        # root candidate (the first word labelled "root", else the best
        # scoring one), blank out the others and decode again.
        if [0 for h in heads if h == 0] != [0]:
            i = ([p for s, e, p in spans] + ["root"]).index("root")
            j = i + 1 if i < n else numpy.nanargmax(scores[:, 0])
            scores[0:j, 0] = scores[j + 1:, 0] = numpy.nan
            heads = ufal.chu_liu_edmonds.chu_liu_edmonds(scores)[0]

        lines = []
        for i, (start, end, label) in enumerate(spans, 1):
            # The decoded head decides which word is the root; a word that
            # was labelled "root" but got a head is relabelled "dep".
            if heads[i] == 0:
                label = "root"
            elif label == "root":
                label = "dep"
            # "_" only when there is a gap before the next word.
            misc = (
                "_" if i < n and end < spans[i][0] else "SpaceAfter=No"
            )
            lines.append(
                "\t".join([
                    str(i),
                    forms[i - 1],
                    "_",
                    # Drops the first two characters of the first tag piece
                    # (presumably a "B-"/"I-" style prefix; confirm against
                    # the model's label set).
                    tags[start][0][2:],
                    "_",
                    "|".join(tags[start][1:]),
                    str(heads[i]),
                    label,
                    "_",
                    misc,
                ]) + "\n"
            )
        return "".join(lines) + "\n"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import unittest | ||
from pythainlp.parse import dependency_parsing | ||
|
||
|
||
class TestParsePackage(unittest.TestCase):
    """Smoke tests for the ``pythainlp.parse`` package."""

    def test_dependency_parsing(self):
        """Every engine, in turn, returns something other than ``None``."""
        sentence = "ผมเป็นคนดี"
        for engine_name in ("esupar", "transformers_ud", "spacy_thai"):
            result = dependency_parsing(sentence, engine=engine_name)
            self.assertIsNotNone(result)