Merge pull request #22 from UCREL/chinese-pos-tagset-mapping

Chinese pos tagset mapping
UCREL · Jan 10, 2022 · 854bce6 · 854bce6
2 parents 8344730 + e5e33bf
commit 854bce6
Show file tree

Hide file tree

Showing 5 changed files with 124 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- A mapping from the [Penn Chinese Treebank POS tagset](https://verbs.colorado.edu/chinese/posguide.3rd.ch.pdf) to USAS core POS tagset.
+- In the documentation it clarifies that we used the [Universal Dependencies Treebank](https://universaldependencies.org/u/pos/) version of the UPOS tagset rather than the original version from the [paper by Petrov et al. 2012](http://www.lrec-conf.org/proceedings/lrec2012/pdf/274_Paper.pdf).
+- The usage documentation, for the "How-to Tag Text", has been updated so that the Chinese example includes using POS information.
 - A `CHANGELOG` file has been added. The format of the `CHANGELOG` file will now be used for the formats of all current and future GitHub release notes. For more information on the `CHANGELOG` file format see [Keep a Changelog.](https://keepachangelog.com/en/1.0.0/)
 
 ## [v0.1.0](https://github.com/UCREL/pymusas/releases/tag/v0.1.0) - 2021-12-07

diff --git a/docs/docs/api/pos_mapper.md b/docs/docs/api/pos_mapper.md
@@ -10,9 +10,18 @@
 
 
 - __UPOS\_TO\_USAS\_CORE__ : `Dict[str, List[str]]` <br/>
-    A mapping from the [Universal Part Of Speech (UPOS) tagset](http://www.lrec-conf.org/proceedings/lrec2012/pdf/274_Paper.pdf)
-    to the USAS core tagset. UPOS is used by the
-    [Universal Dependencies Tree Bank.](https://universaldependencies.org/u/pos/)
+    A mapping from the Universal Part Of Speech (UPOS) tagset to the USAS core tagset. The UPOS tagset used
+    here is the same as that used by the [Universal Dependencies Treebank project](https://universaldependencies.org/u/pos/).
+    This is slightly different to the original presented in the
+    [paper by Petrov et al. 2012](http://www.lrec-conf.org/proceedings/lrec2012/pdf/274_Paper.pdf),
+    for this original tagset see the following [GitHub repository](https://github.com/slavpetrov/universal-pos-tags).
+
+- __PENN\_CHINESE\_TREEBANK\_TO\_USAS\_CORE__ : `Dict[str, List[str]]` <br/>
+    A mapping from the [Penn Chinese Treebank tagset](https://verbs.colorado.edu/chinese/posguide.3rd.ch.pdf)
+    to the USAS core tagset. The Penn Chinese Treebank tagset here is slightly different to the original
+    as it contains three extra tags, `X`, `URL`, and `INF`, that appear to be unique to
+    the [spaCy Chinese models](https://spacy.io/models/zh). For more information on how this mapping was
+    created, see the following [GitHub issue](https://github.com/UCREL/pymusas/issues/19).
 
 <a id="pymusas.pos_mapper.UPOS_TO_USAS_CORE"></a>
 
@@ -27,6 +36,19 @@ UPOS_TO_USAS_CORE: Dict[str, List[str]] = {
     'CCONJ': ['c ...
 ```
 
+<a id="pymusas.pos_mapper.PENN_CHINESE_TREEBANK_TO_USAS_CORE"></a>
+
+#### PENN\_CHINESE\_TREEBANK\_TO\_USAS\_CORE
+
+```python
+PENN_CHINESE_TREEBANK_TO_USAS_CORE: Dict[str, List[str]] = {
+    'AS': ['part'],
+    'DEC': ['part'],
+    'DEG': ['part'],
+    'DER': ['part'],
+    'DEV': ['pa ...
+```
+
 <a id="pymusas.pos_mapper.upos_to_usas_core"></a>
 
 ### upos\_to\_usas\_core

diff --git a/docs/docs/usage/how_to/tag_text.md b/docs/docs/usage/how_to/tag_text.md
@@ -18,7 +18,7 @@ python -m spacy download zh_core_web_sm
 Then create the tagger, in a Python script:
 
 :::note
-Currently there is not lemmatisation component in the spaCy pipeline for Chinese.
+Currently there is no lemmatisation component in the spaCy pipeline for Chinese.
 :::
 
 ``` python

diff --git a/pymusas/pos_mapper.py b/pymusas/pos_mapper.py
@@ -2,10 +2,18 @@
 # Attributes
 
 UPOS_TO_USAS_CORE: `Dict[str, List[str]]`
-    A mapping from the [Universal Part Of Speech (UPOS) tagset](http://www.lrec-conf.org/proceedings/lrec2012/pdf/274_Paper.pdf)
-    to the USAS core tagset. UPOS is used by the
-    [Universal Dependencies Tree Bank.](https://universaldependencies.org/u/pos/)
+    A mapping from the Universal Part Of Speech (UPOS) tagset to the USAS core tagset. The UPOS tagset used
+    here is the same as that used by the [Universal Dependencies Treebank project](https://universaldependencies.org/u/pos/).
+    This is slightly different to the original presented in the
+    [paper by Petrov et al. 2012](http://www.lrec-conf.org/proceedings/lrec2012/pdf/274_Paper.pdf),
+    for this original tagset see the following [GitHub repository](https://github.com/slavpetrov/universal-pos-tags).
 
+PENN_CHINESE_TREEBANK_TO_USAS_CORE: `Dict[str, List[str]]`
+    A mapping from the [Penn Chinese Treebank tagset](https://verbs.colorado.edu/chinese/posguide.3rd.ch.pdf)
+    to the USAS core tagset. The Penn Chinese Treebank tagset here is slightly different to the original
+    as it contains three extra tags, `X`, `URL`, and `INF`, that appear to be unique to
+    the [spaCy Chinese models](https://spacy.io/models/zh). For more information on how this mapping was
+    created, see the following [GitHub issue](https://github.com/UCREL/pymusas/issues/19).
 '''
 from typing import Dict, List
 
@@ -30,6 +38,45 @@
     'X': ['fw', 'xx']
 }
 
+PENN_CHINESE_TREEBANK_TO_USAS_CORE: Dict[str, List[str]] = {
+    'AS': ['part'],
+    'DEC': ['part'],
+    'DEG': ['part'],
+    'DER': ['part'],
+    'DEV': ['part'],
+    'ETC': ['part'],
+    'LC': ['part'],
+    'MSP': ['part'],
+    'SP': ['part'],
+    'BA': ['fw', 'xx'],
+    'FW': ['fw', 'xx'],
+    'IJ': ['intj'],
+    'LB': ['fw', 'xx'],
+    'ON': ['fw', 'xx'],
+    'SB': ['fw', 'xx'],
+    'X': ['fw', 'xx'],
+    'URL': ['fw', 'xx'],
+    'INF': ['fw', 'xx'],
+    'NN': ['noun'],
+    'NR': ['pnoun'],
+    'NT': ['noun'],
+    'VA': ['verb'],
+    'VC': ['verb'],
+    'VE': ['verb'],
+    'VV': ['verb'],
+    'CD': ['num'],
+    'M': ['num'],
+    'OD': ['num'],
+    'DT': ['det', 'art'],
+    'CC': ['conj'],
+    'CS': ['conj'],
+    'AD': ['adv'],
+    'JJ': ['adj'],
+    'P': ['prep'],
+    'PN': ['pron'],
+    'PU': ['punc']
+}
+
 
 def upos_to_usas_core(upos_tag: str) -> List[str]:
     '''

diff --git a/tests/test_pos_mapper.py b/tests/test_pos_mapper.py
@@ -1,4 +1,4 @@
-from pymusas.pos_mapper import upos_to_usas_core
+from pymusas.pos_mapper import PENN_CHINESE_TREEBANK_TO_USAS_CORE, upos_to_usas_core
 
 
 def test_upos_to_usas_core() -> None:
@@ -14,3 +14,47 @@ def test_upos_to_usas_core() -> None:
         assert usas_tags != []
         for usas_tag in usas_tags:
             assert usas_tag.lower() == usas_tag
+
+
+def test_penn_chinese_to_usas_core() -> None:
+    assert len(PENN_CHINESE_TREEBANK_TO_USAS_CORE) == 36
+    penn_chinese_treebank_mapping = {'VA': ['verb'],
+                                     'VC': ['verb'],
+                                     'VE': ['verb'],
+                                     'VV': ['verb'],
+                                     'NR': ['pnoun'],
+                                     'NT': ['noun'],
+                                     'NN': ['noun'],
+                                     'LC': ['part'],
+                                     'PN': ['pron'],
+                                     'DT': ['det', 'art'],
+                                     'CD': ['num'],
+                                     'OD': ['num'],
+                                     'M': ['num'],
+                                     'AD': ['adv'],
+                                     'P': ['prep'],
+                                     'CC': ['conj'],
+                                     'CS': ['conj'],
+                                     'DEC': ['part'],
+                                     'DEG': ['part'],
+                                     'DER': ['part'],
+                                     'DEV': ['part'],
+                                     'SP': ['part'],
+                                     'AS': ['part'],
+                                     'ETC': ['part'],
+                                     'MSP': ['part'],
+                                     'IJ': ['intj'],
+                                     'ON': ['fw', 'xx'],
+                                     'PU': ['punc'],
+                                     'JJ': ['adj'],
+                                     'FW': ['fw', 'xx'],
+                                     'LB': ['fw', 'xx'],
+                                     'SB': ['fw', 'xx'],
+                                     'BA': ['fw', 'xx'],
+                                     'INF': ['fw', 'xx'],
+                                     'URL': ['fw', 'xx'],
+                                     'X': ['fw', 'xx']}
+    assert 36 == len(penn_chinese_treebank_mapping)
+
+    for chinese_penn_tag, usas_core_tag in PENN_CHINESE_TREEBANK_TO_USAS_CORE.items():
+        assert penn_chinese_treebank_mapping[chinese_penn_tag] == usas_core_tag