### parsing results 

In [10]:
from stanza.utils.conll import CoNLL
import stanza

# Load the raw test text once; every pipeline below is run on this same string.
with open("zh_gsdsimp-ud-test.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

In [11]:
nlp_LTP = stanza.Pipeline(
    lang="zh-hans",
    depparse_model_path="models/UD_Chinese-GSDLTP/UD_Chinese-GSDSimp_model-bert/saved_models/depparse/zh-hans_gsdsimp_electra-large_parser.pt",
    pos_model_path="models/UD_Chinese-GSDLTP/UD_Chinese-GSDSimp_model-bert/saved_models/pos/zh-hans_gsdsimp_charlm_tagger.pt",
    tokenize_model_path="models/UD_Chinese-GSDLTP/UD_Chinese-GSDSimp_model-bert/saved_models/tokenize/zh-hans_gsdsimp_tokenizer.pt",
    processors='tokenize,pos,lemma,depparse', )

doc = nlp_LTP(raw_text)

with open("test-ltp-prediction.conllu", "w", encoding="utf-8") as out_file:
    for sent_id, sent in enumerate(doc.sentences, start=1):
        out_file.write(f"# sent_id = {sent_id}\n")
        out_file.write(f"# text = {' '.join([w.text for w in sent.words])}\n")
        for i, word in enumerate(sent.words, start=1):
            out_file.write(
                f"{i}\t{word.text}\t{word.lemma}\t{word.upos}\t{word.xpos}\t_\t"
                f"{word.head}\t{word.deprel}\t_\t_\n"
            )
        out_file.write("\n")

import sys
!{sys.executable} conll18_ud_eval.py -v  test-ltp-gold.conllu test-ltp-prediction.conllu

2025-03-28 20:47:07 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-03-28 20:47:07 INFO: Downloaded file to /Users/jungyeul/stanza_resources/resources.json
2025-03-28 20:47:09 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package                 |
---------------------------------------
| tokenize  | models/UD_...kenizer.pt |
| pos       | models/UD_..._tagger.pt |
| lemma     | gsdsimp_nocharlm        |
| depparse  | models/UD_..._parser.pt |

2025-03-28 20:47:09 INFO: Using device: cpu
2025-03-28 20:47:09 INFO: Loading: tokenize
2025-03-28 20:47:09 INFO: Loading: pos
2025-03-28 20:47:10 INFO: Loading: lemma
2025-03-28 20:47:10 INFO: Loading: depparse
2025-03-28 20:47:12 INFO: Done loading processors!
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Metric     | Precision |    Recall |  F1 Score | AligndAcc
-----------+-----------+-----------+-----------+-----------
Tokens     |     91.45 |     93.55 |     92.49 |
Sentences  |     99.40 |     98.80 |     99.10 |
Words      |     91.45 |     93.55 |     92.49 |
UPOS       |     86.29 |     88.27 |     87.27 |     94.36
XPOS       |     86.85 |     88.84 |     87.83 |     94.97
UFeats     |     79.67 |     81.50 |     80.57 |     87.12
AllTags    |     74.34 |     76.04 |     75.18 |     81.28
Lemmas     |     82.69 |     84.59 |     83.63 |     90.42
UAS        |     75.28 |     77.01 |     76.13 |     82.31
LAS        |     72.06 |     73.72 |     72.88 |     78.80
CLAS       |     68.50 |     70.42 |     69.45 |     77.37
MLAS       |     51.05 |     52.49 |     51.76 |     57.67
BLEX       |     57.83 |     59.46 |     58.64 |     65.32


In [12]:
nlp_Penn = stanza.Pipeline(
    lang="zh-hans",
    depparse_model_path="models/UD_Chinese-GSDPenn/UD_Chinese-GSDSimp_model-bert/saved_models/depparse/zh-hans_gsdsimp_electra-large_parser.pt",
    pos_model_path="models/UD_Chinese-GSDPenn/UD_Chinese-GSDSimp_model-bert/saved_models/pos/zh-hans_gsdsimp_charlm_tagger.pt",
    tokenize_model_path="models/UD_Chinese-GSDPenn/UD_Chinese-GSDSimp_model-bert/saved_models/tokenize/zh-hans_gsdsimp_tokenizer.pt",
    processors='tokenize,pos,lemma,depparse', )

doc = nlp_Penn(raw_text)

with open("test-penn-prediction.conllu", "w", encoding="utf-8") as out_file:
    for sent_id, sent in enumerate(doc.sentences, start=1):
        out_file.write(f"# sent_id = {sent_id}\n")
        out_file.write(f"# text = {' '.join([w.text for w in sent.words])}\n")
        for i, word in enumerate(sent.words, start=1):
            out_file.write(
                f"{i}\t{word.text}\t{word.lemma}\t{word.upos}\t{word.xpos}\t_\t"
                f"{word.head}\t{word.deprel}\t_\t_\n"
            )
        out_file.write("\n")


import sys
!{sys.executable} conll18_ud_eval.py -v  test-penn-gold.conllu test-penn-prediction.conllu

2025-03-28 20:48:16 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-03-28 20:48:16 INFO: Downloaded file to /Users/jungyeul/stanza_resources/resources.json
2025-03-28 20:48:17 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package                 |
---------------------------------------
| tokenize  | models/UD_...kenizer.pt |
| pos       | models/UD_..._tagger.pt |
| lemma     | gsdsimp_nocharlm        |
| depparse  | models/UD_..._parser.pt |

2025-03-28 20:48:17 INFO: Using device: cpu
2025-03-28 20:48:17 INFO: Loading: tokenize
2025-03-28 20:48:18 INFO: Loading: pos
2025-03-28 20:48:19 INFO: Loading: lemma
2025-03-28 20:48:19 INFO: Loading: depparse
2025-03-28 20:48:20 INFO: Done loading processors!
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Metric     | Precision |    Recall |  F1 Score | AligndAcc
-----------+-----------+-----------+-----------+-----------
Tokens     |     90.46 |     92.53 |     91.48 |
Sentences  |     99.40 |     98.80 |     99.10 |
Words      |     90.46 |     92.53 |     91.48 |
UPOS       |     84.65 |     86.58 |     85.60 |     93.57
XPOS       |     85.91 |     87.88 |     86.89 |     94.97
UFeats     |     78.84 |     80.64 |     79.73 |     87.15
AllTags    |     73.19 |     74.86 |     74.02 |     80.91
Lemmas     |     83.20 |     85.11 |     84.14 |     91.98
UAS        |     73.68 |     75.36 |     74.51 |     81.44
LAS        |     70.59 |     72.20 |     71.38 |     78.03
CLAS       |     66.39 |     68.40 |     67.38 |     76.38
MLAS       |     49.29 |     50.78 |     50.02 |     56.70
BLEX       |     57.99 |     59.74 |     58.85 |     66.71


In [13]:
nlp_PKU = stanza.Pipeline(
    lang="zh-hans",
    depparse_model_path="models/UD_Chinese-GSDPKU/UD_Chinese-GSDSimp_model-bert/saved_models/depparse/zh-hans_gsdsimp_electra-large_parser.pt",
    pos_model_path="models/UD_Chinese-GSDPKU/UD_Chinese-GSDSimp_model-bert/saved_models/pos/zh-hans_gsdsimp_charlm_tagger.pt",
    tokenize_model_path="models/UD_Chinese-GSDPKU/UD_Chinese-GSDSimp_model-bert/saved_models/tokenize/zh-hans_gsdsimp_tokenizer.pt",
    processors='tokenize,pos,lemma,depparse', )
    
doc = nlp_PKU(raw_text)

with open("test-pku-prediction.conllu", "w", encoding="utf-8") as out_file:
    for sent_id, sent in enumerate(doc.sentences, start=1):
        out_file.write(f"# sent_id = {sent_id}\n")
        out_file.write(f"# text = {' '.join([w.text for w in sent.words])}\n")
        for i, word in enumerate(sent.words, start=1):
            out_file.write(
                f"{i}\t{word.text}\t{word.lemma}\t{word.upos}\t{word.xpos}\t_\t"
                f"{word.head}\t{word.deprel}\t_\t_\n"
            )
        out_file.write("\n")


import sys
!{sys.executable} conll18_ud_eval.py -v  test-pku-gold.conllu test-pku-prediction.conllu

2025-03-28 20:49:24 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-03-28 20:49:24 INFO: Downloaded file to /Users/jungyeul/stanza_resources/resources.json
2025-03-28 20:49:25 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package                 |
---------------------------------------
| tokenize  | models/UD_...kenizer.pt |
| pos       | models/UD_..._tagger.pt |
| lemma     | gsdsimp_nocharlm        |
| depparse  | models/UD_..._parser.pt |

2025-03-28 20:49:25 INFO: Using device: cpu
2025-03-28 20:49:25 INFO: Loading: tokenize
2025-03-28 20:49:25 INFO: Loading: pos
2025-03-28 20:49:27 INFO: Loading: lemma
2025-03-28 20:49:27 INFO: Loading: depparse
2025-03-28 20:49:29 INFO: Done loading processors!
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Metric     | Precision |    Recall |  F1 Score | AligndAcc
-----------+-----------+-----------+-----------+-----------
Tokens     |     89.33 |     91.54 |     90.42 |
Sentences  |     98.80 |     98.80 |     98.80 |
Words      |     89.33 |     91.54 |     90.42 |
UPOS       |     83.41 |     85.48 |     84.44 |     93.38
XPOS       |     85.02 |     87.13 |     86.06 |     95.18
UFeats     |     77.83 |     79.76 |     78.78 |     87.13
AllTags    |     72.42 |     74.21 |     73.31 |     81.07
Lemmas     |     82.29 |     84.33 |     83.30 |     92.13
UAS        |     72.32 |     74.11 |     73.21 |     80.96
LAS        |     69.32 |     71.04 |     70.17 |     77.60
CLAS       |     65.17 |     67.57 |     66.35 |     76.59
MLAS       |     48.22 |     49.99 |     49.09 |     56.67
BLEX       |     57.23 |     59.33 |     58.26 |     67.25


In [15]:
nlp_GSD = stanza.Pipeline(
    lang="zh-hans",
    # download_method=DownloadMethod.REUSE_RESOURCES,
    depparse_model_path="models/UD_Chinese-GSDSimp/UD_Chinese-GSDSimp_model-bert/saved_models/depparse/zh-hans_gsdsimp_electra-large_parser.pt",
    pos_model_path="models/UD_Chinese-GSDSimp/UD_Chinese-GSDSimp_model-bert/saved_models/pos/zh-hans_gsdsimp_charlm_tagger.pt",
    tokenize_model_path="models/UD_Chinese-GSDSimp/UD_Chinese-GSDSimp_model-bert/saved_models/tokenize/zh-hans_gsdsimp_tokenizer.pt",
    processors='tokenize,pos,lemma,depparse', ) 

doc = nlp_GSD(raw_text)

with open("test-gsd-prediction.conllu", "w", encoding="utf-8") as out_file:
    for sent_id, sent in enumerate(doc.sentences, start=1):
        out_file.write(f"# sent_id = {sent_id}\n")
        out_file.write(f"# text = {' '.join([w.text for w in sent.words])}\n")
        for i, word in enumerate(sent.words, start=1):
            out_file.write(
                f"{i}\t{word.text}\t{word.lemma}\t{word.upos}\t{word.xpos}\t_\t"
                f"{word.head}\t{word.deprel}\t_\t_\n"
            )
        out_file.write("\n")


import sys
!{sys.executable} conll18_ud_eval.py -v  test-gsd-gold.conllu test-gsd-prediction.conllu

2025-03-28 20:51:31 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-03-28 20:51:31 INFO: Downloaded file to /Users/jungyeul/stanza_resources/resources.json
2025-03-28 20:51:32 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package                 |
---------------------------------------
| tokenize  | models/UD_...kenizer.pt |
| pos       | models/UD_..._tagger.pt |
| lemma     | gsdsimp_nocharlm        |
| depparse  | models/UD_..._parser.pt |

2025-03-28 20:51:32 INFO: Using device: cpu
2025-03-28 20:51:32 INFO: Loading: tokenize
2025-03-28 20:51:32 INFO: Loading: pos
2025-03-28 20:51:33 INFO: Loading: lemma
2025-03-28 20:51:34 INFO: Loading: depparse
2025-03-28 20:51:35 INFO: Done loading processors!
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Metric     | Precision |    Recall |  F1 Score | AligndAcc
-----------+-----------+-----------+-----------+-----------
Tokens     |     93.47 |     94.51 |     93.99 |
Sentences  |     99.40 |     98.80 |     99.10 |
Words      |     93.47 |     94.51 |     93.99 |
UPOS       |     90.05 |     91.04 |     90.54 |     96.34
XPOS       |     89.87 |     90.87 |     90.37 |     96.15
UFeats     |     82.73 |     83.64 |     83.18 |     88.50
AllTags    |     79.06 |     79.94 |     79.50 |     84.58
Lemmas     |     92.97 |     94.00 |     93.48 |     99.46
UAS        |     79.61 |     80.49 |     80.05 |     85.17
LAS        |     76.69 |     77.54 |     77.11 |     82.05
CLAS       |     74.29 |     75.29 |     74.79 |     81.43
MLAS       |     57.87 |     58.65 |     58.26 |     63.44
BLEX       |     74.01 |     75.01 |     74.51 |     81.13


### segmentation reproducibility 

In [1]:
from jp_algorithm import load_txt_file
from jp_algorithm import evaluate as txt_evaluate

In [3]:
def do_nothing():
    """No-op placeholder that always returns 0.

    No longer called by the loop below; kept so that any other cell that
    still references it keeps working.
    """
    return  0

corpus = ['gsd', 'ltp', 'penn', 'pku']

gold = 'gold'
predict = 'prediction'

# For each corpus: turn the predicted CoNLL-U file into one space-separated
# sentence per line, then score its segmentation against the gold ".wb.txt".
for i in corpus:

    print('Corpus:\t' + i)

    conllu = "test-" + i + "-" + predict + ".conllu"
    predict_file = "test-" + i + "-" + predict + ".conllu.wb.txt"
    gold_file = "test-" + i + "-" + gold + ".conllu.wb.txt"

    # The prediction files contain Chinese text written as UTF-8, so the
    # encoding is given explicitly instead of relying on the platform default.
    with open(conllu, "r", encoding="utf-8") as file_in:
        data = file_in.read().splitlines()

    # The context manager closes the output file even if a line fails to parse.
    with open(predict_file, "w", encoding="utf-8") as f:
        sentence = []

        for line in data:
            line = line.strip()
            if line.startswith('#'):
                # Comment lines (sent_id / text) carry no tokens.
                continue
            if len(line) == 0:
                # A blank line ends the current sentence.
                if len(sentence) > 0:
                    f.write(' '.join(sentence) + '\n')
                sentence = []
                continue
            # Token line: the second tab-separated column is the word form.
            sentence.append(line.split('\t')[1])

        # Flush the last sentence when the file does not end with a blank line;
        # otherwise it would be silently dropped.
        if len(sentence) > 0:
            f.write(' '.join(sentence) + '\n')

    gold_txt = load_txt_file(gold_file)
    sys_txt  = load_txt_file(predict_file)

    evaluation = txt_evaluate(gold_txt, sys_txt)
    for metric in ["Tokens"]:
        score = evaluation[metric]

        # Fall back to 0.0 instead of raising ZeroDivisionError when a total
        # is zero (e.g. an empty prediction file) or nothing matched.
        precision = score.correct / score.system_total if score.system_total else 0.0
        recall = score.correct / score.gold_total if score.gold_total else 0.0
        if precision + recall:
            f_score = 2 * (precision * recall) / (precision + recall)
        else:
            f_score = 0.0
        print('F score = ', f_score)

    print('-----\n')

Corpus:	gsd
F score =  0.9398518027900815
-----

Corpus:	ltp
F score =  0.9251036388760939
-----

Corpus:	penn
F score =  0.8903423699760926
-----

Corpus:	pku
F score =  0.8689556637049606
-----

