Skip to content

Commit

Permalink
Merge pull request #802 from PyThaiNLP/add-coref
Browse files Browse the repository at this point in the history
Add pythainlp.coref
  • Loading branch information
wannaphong committed Jun 5, 2023
2 parents 33c5b5a + f58d51a commit 3600236
Show file tree
Hide file tree
Showing 12 changed files with 175 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/macos-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ jobs:
pip install pytest coverage coveralls
conda install -c conda-forge icu
conda install -c conda-forge pyicu
if [ -f docker_requirements.txt ]; then pip install -r docker_requirements.txt; fi
if [ -f docker_requirements.txt ]; then SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r docker_requirements.txt; fi
pip install deepcut tltk
pip install .[full]
python -m nltk.downloader omw-1.4
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pypi-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install deepcut tltk
pip install -r https://raw.githubusercontent.com/PyThaiNLP/pythainlp/dev/docker_requirements.txt
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r https://raw.githubusercontent.com/PyThaiNLP/pythainlp/dev/docker_requirements.txt
pip install pythainlp[full]
python -m nltk.downloader omw-1.4
- name: Test
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install pytest coverage coveralls
if [ -f docker_requirements.txt ]; then pip install -r docker_requirements.txt; fi
if [ -f docker_requirements.txt ]; then SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r docker_requirements.txt; fi
pip install deepcut tltk
pip install .[full]
python -m nltk.downloader omw-1.4
Expand Down
7 changes: 4 additions & 3 deletions docker_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ epitran==1.9
sacremoses==0.0.41
sentencepiece==0.1.91
ssg==0.0.8
torch==1.8.1
torch==1.13.1
fastai==1.0.61
transformers==4.22.1
phunspell==0.1.6
Expand All @@ -24,13 +24,14 @@ deepcut==0.7.0.0
h5py==3.1.0
tensorflow==2.9.3
pandas==1.4.*
tltk==1.3.8
tltk==1.6.8
OSKut==1.3
nlpo3==1.2.6
thai-nner==0.3
spacy==2.3.*
spacy==3.5.*
wunsen==0.0.3
khanaa==0.0.6
spacy_thai==0.7.1
esupar==1.3.8
ufal.chu-liu-edmonds==1.0.2
fastcoref==2.1.6
10 changes: 10 additions & 0 deletions docs/api/coref.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
.. currentmodule:: pythainlp.coref

pythainlp.coref
===============
The :mod:`pythainlp.coref` module provides coreference resolution for Thai.

Modules
-------

.. autofunction:: coreference_resolution
1 change: 1 addition & 0 deletions docs/notes/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ where ``extras`` can be
- ``esupar`` (to support esupar engine)
- ``transformers_ud`` (to support transformers_ud engine)
- ``dependency_parsing`` (to support dependency parsing with all engine)
- ``coreference_resolution`` (to support coreference resolution with all engine)
- ``full`` (install everything)

For dependency details, look at `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.
Expand Down
19 changes: 19 additions & 0 deletions pythainlp/coref/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
PyThaiNLP Coreference Resolution
"""
__all__ = ["coreference_resolution"]
from pythainlp.coref.core import coreference_resolution
38 changes: 38 additions & 0 deletions pythainlp/coref/_fastcoref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import spacy


class FastCoref:
    """
    Wrapper around a ``fastcoref`` model that returns plain dictionaries.

    :param model_name: model identifier passed through to the fastcoref
        model class
    :param nlp: spaCy pipeline handed to the fastcoref model; when ``None``
        (default) a blank Thai pipeline is created with ``spacy.blank("th")``
    :param str device: device passed to the fastcoref model
    :param str type: ``"FCoref"`` selects ``fastcoref.FCoref``; any other
        value selects ``fastcoref.LingMessCoref``
    """

    # NOTE: ``type`` shadows the builtin, but it is part of the public
    # signature, so the name is kept for backward compatibility.
    def __init__(
        self,
        model_name,
        nlp=None,
        device: str = "cpu",
        type: str = "FCoref",
    ) -> None:
        # Build the default pipeline per instance instead of once at import
        # time, so importing this module stays cheap and instances do not
        # share one pipeline object through the default argument.
        if nlp is None:
            nlp = spacy.blank("th")
        # fastcoref is imported lazily so it is only required when a model
        # is actually instantiated.
        if type == "FCoref":
            from fastcoref import FCoref as _model
        else:
            from fastcoref import LingMessCoref as _model
        self.model_name = model_name
        self.nlp = nlp
        self.model = _model(self.model_name, device=device, nlp=self.nlp)

    def _to_json(self, _predict) -> dict:
        """
        Convert one prediction object into a plain dictionary.

        :param _predict: prediction object exposing ``text`` and
            ``get_clusters(as_strings=...)``
        :return: dict with the keys ``text``, ``clusters_string`` and
            ``clusters``
        :rtype: dict
        """
        return {
            "text": _predict.text,
            "clusters_string": _predict.get_clusters(as_strings=True),
            "clusters": _predict.get_clusters(as_strings=False),
        }

    def predict(self, texts: List[str]) -> List[dict]:
        """
        Run the model on ``texts`` and return one dictionary per prediction.

        :param List[str] texts: texts to run coreference resolution on
        :return: list of dictionaries as produced by :meth:`_to_json`
        :rtype: List[dict]
        """
        return [self._to_json(i) for i in self.model.predict(texts=texts)]
55 changes: 55 additions & 0 deletions pythainlp/coref/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
# Lazily created model instance; once set it is reused by every later call.
model = None


def coreference_resolution(
    texts: List[str], model_name: str = "han-coref-v1.0", device: str = "cpu"
):
    """
    Coreference Resolution

    The model is loaded on the first call and cached in the module-level
    ``model`` variable. Later calls reuse the cached model, so their
    ``model_name`` and ``device`` arguments have no effect.

    :param List[str] texts: list of texts to do coreference resolution on
        (a single ``str`` is also accepted and treated as a one-item list)
    :param str model_name: coreference resolution model
    :param str device: device for running coreference resolution model
        (cpu, cuda, and other)
    :return: list of coreference resolution results, one per text
    :rtype: List[dict]
    :raises ValueError: if no model is loaded yet and ``model_name`` is not
        a supported model

    :Options for model_name:
        * *han-coref-v1.0* - (default) Han-Coref: Thai coreference
          resolution by PyThaiNLP v1.0

    :Example:
    ::

        from pythainlp.coref import coreference_resolution

        print(
            coreference_resolution(
                ["Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก"]
            )
        )
        # output:
        # [
        # {'text': 'Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก',
        # 'clusters_string': [['Bill Gates', 'ผม']],
        # 'clusters': [[(0, 10), (50, 52)]]}
        # ]
    """
    global model
    if isinstance(texts, str):
        texts = [texts]
    if model is None:
        # Fail with a clear message instead of the AttributeError that
        # calling ``None.predict`` would raise further down.
        if model_name != "han-coref-v1.0":
            raise ValueError(
                f"Unsupported coreference resolution model: {model_name!r}"
            )
        # Imported lazily so the optional dependencies are only required
        # when the model is actually used.
        from pythainlp.coref.han_coref import HanCoref

        model = HanCoref(device=device)
    return model.predict(texts)
25 changes: 25 additions & 0 deletions pythainlp/coref/han_coref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pythainlp.coref._fastcoref import FastCoref
import spacy


class HanCoref(FastCoref):
    """
    :class:`FastCoref` preconfigured with the ``pythainlp/han-coref-v1.0``
    model.

    :param str device: device passed to the underlying model
    :param nlp: spaCy pipeline passed to the underlying model; when ``None``
        (default) a blank Thai pipeline is created with ``spacy.blank("th")``
    """

    def __init__(self, device: str = "cpu", nlp=None) -> None:
        # Build the default pipeline per instance rather than at import time.
        if nlp is None:
            nlp = spacy.blank("th")
        # Zero-argument super() resolves against this class itself;
        # super(self.__class__, self) recurses forever once HanCoref is
        # subclassed, because self.__class__ is then the subclass.
        super().__init__(
            model_name="pythainlp/han-coref-v1.0",
            device=device,
            nlp=nlp,
        )
6 changes: 6 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@
"ufal.chu-liu-edmonds>=1.0.2",
"transformers>=4.22.1",
],
"coreference_resolution":{
"spacy>=3.0",
"fastcoref>=2.1.5",
},
"full": [
"PyYAML>=5.3.1",
"attacut>=1.0.4",
Expand Down Expand Up @@ -137,6 +141,8 @@
"thai_nner",
"wunsen>=0.0.3",
"spacy_thai>=0.7.1",
"spacy>=3.0",
"fastcoref>=2.1.5",
"ufal.chu-liu-edmonds>=1.0.2",
],
}
Expand Down
14 changes: 14 additions & 0 deletions tests/test_coref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

import unittest
from pythainlp.coref import coreference_resolution


class TestParsePackage(unittest.TestCase):
    """Tests for :mod:`pythainlp.coref`."""

    # Reported as skipped instead of silently passing with an empty body.
    # NOTE(review): presumably disabled because running it needs the
    # han-coref-v1.0 model and the optional spacy/fastcoref dependencies.
    # Confirm, then remove the skip once the test environment provides them.
    @unittest.skip(
        "coreference_resolution test is disabled: "
        "it requires the han-coref-v1.0 model"
    )
    def test_coreference_resolution(self):
        """A single input string should produce a non-None result."""
        self.assertIsNotNone(
            coreference_resolution(
                "Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก"
            )
        )

0 comments on commit 3600236

Please sign in to comment.