# Chinese Noun and MW Script
Boxuan Li + Stefan Pophristic 
April 2025

I am making edits in this one, will merge with the original later. -Stefan

Generate a csv with all combos of classifiers and nouns in the corpus. 

Input: link to corpus

Output: 
- chinese_noun_mw.csv 
    - Noun: (string) Noun used
    - MW: (string) measure word used
    - Count_Pre: (int) Number of times a given measure word appears before a noun in the corpus. 
    - Count_Post: (int) Number of times a given measure word appears after a noun in the corpus.
- chinese_all_nouns.csv
    - Noun: (string) Noun
    - Count: (int) Number of times a given noun appears in the corpus


# Parameters

In [27]:
import os
import pandas as pd
import requests
import tarfile
import io
from collections import defaultdict
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np

# For displaying Chinese characters properly
import matplotlib
matplotlib.rcParams['font.family'] = ['Arial Unicode MS', 'SimHei', 'sans-serif']

In [3]:
# Set up some configuration variables
OUTPUT_CSV = '/Volumes/Server/SHARED/Corpora/Universal_Dependencies/2025_InformationTheory_Project/chinese_noun_mw.csv'
CORPUS_URL = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-4923/ud-treebanks-v2.12.tgz"

# Load in & Format Corpus

In [4]:
"""
Finds only Chinese GSDSimp files in the specified directory.

Input: 
    - base_dir (string): Directory where the treebanks were extracted
Output: 
    - List of file paths to Chinese GSDSimp CoNLL-U files
"""
def get_chinese_ud_files(base_dir):
    gsdsimp_files = []
    
    # Walk through directories looking for GSDSimp CoNLL-U files
    for root, dirs, files in os.walk(base_dir):
        if "UD_Chinese-GSDSimp" in root:
            for file in files:
                if file.endswith(".conllu"):
                    gsdsimp_files.append(os.path.join(root, file))
    
    print(f"Found {len(gsdsimp_files)} Chinese GSDSimp files.")
    return gsdsimp_files

# Now call the function with the actual path to your extracted files
# Replace this path with where you actually extracted the files
base_directory = '/Volumes/Server/SHARED/Corpora/Universal_Dependencies/2025_InformationTheory_Project/Universal Dependencies 2.15/ud-treebanks-v2.15'
chinese_files = get_chinese_ud_files(base_directory)

for i in chinese_files:
    print(i)

Found 3 Chinese GSDSimp files.
/Volumes/Server/SHARED/Corpora/Universal_Dependencies/2025_InformationTheory_Project/Universal Dependencies 2.15/ud-treebanks-v2.15/UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu
/Volumes/Server/SHARED/Corpora/Universal_Dependencies/2025_InformationTheory_Project/Universal Dependencies 2.15/ud-treebanks-v2.15/UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu
/Volumes/Server/SHARED/Corpora/Universal_Dependencies/2025_InformationTheory_Project/Universal Dependencies 2.15/ud-treebanks-v2.15/UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu


In [5]:
# ## 3. Parsing CoNLL-U Format
# 
# The corpus is in CoNLL-U format, which contains 10 columns of linguistic information.
# We'll parse this format into a more structured representation for analysis.

# %%
def parse_conllu_sentences(file_path):
    """
    Parses a CoNLL-U format file into structured sentence objects with dependency information.
    
    Input: file_path (string) - Path to a CoNLL-U format file
    Output: List of sentence dictionaries, where each dictionary contains:
            - 'id': sentence ID
            - 'tokens': list of token dictionaries with linguistic annotations
            - 'token_dict': dictionary mapping token IDs to token objects for easy lookup
    """
    sentences = []
    current_sentence = []
    sent_id = None
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            
            if line.startswith('# sent_id ='):
                sent_id = line.split('=')[1].strip()
            elif line.startswith('#'):
                continue
            elif not line:
                if current_sentence:
                    # Build token lookup by ID for easy reference
                    token_dict = {token['id']: token for token in current_sentence}
                    
                    # Link tokens based on dependencies
                    for token in current_sentence:
                        token['head_token'] = token_dict.get(token['head'], None)
                    
                    sentences.append({
                        'id': sent_id,
                        'tokens': current_sentence,
                        'token_dict': token_dict
                    })
                    current_sentence = []
                    sent_id = None
                continue
            
            fields = line.split('\t')
            if len(fields) == 10 and '-' not in fields[0]:
                token = {
                    'id': fields[0],
                    'form': fields[1],
                    'lemma': fields[2],
                    'upos': fields[3],
                    'xpos': fields[4],
                    'feats': fields[5],
                    'head': fields[6],  # ID of the head token
                    'deprel': fields[7], # Dependency relation
                    'deps': fields[8],
                    'misc': fields[9],
                    'head_token': None  # Will be populated after parsing
                }
                current_sentence.append(token)
    
    if current_sentence:
        token_dict = {token['id']: token for token in current_sentence}
        for token in current_sentence:
            token['head_token'] = token_dict.get(token['head'], None)
        
        sentences.append({
            'id': sent_id,
            'tokens': current_sentence,
            'token_dict': token_dict
        })
    
    return sentences


In [16]:
# sample_file = chinese_files[0] # Take first file as sample
for i in chinese_files:
    sample_sentences = parse_conllu_sentences(i)
    print(len(sample_sentences))

500
500
3997


In [7]:
len(sample_sentences)

3997

In [20]:
# Parse all files for the sentences

sample_sentences = []

if chinese_files:
    for file in chinese_files:
        
        sample_sentences = sample_sentences + parse_conllu_sentences(file)
    
print(f"Parsed {len(sample_sentences)} sentences from {len(chinese_files)}")

all_sentences = sample_sentences

Parsed 4997 sentences from 3


In [9]:
# Display a sample sentence
if sample_sentences:
    print("\nSample sentence structure:")
    sample_sentence = sample_sentences[0]
    print(f"Sentence ID: {sample_sentence['id']}")
    print(f"Number of tokens: {len(sample_sentence['tokens'])}")

    ex_sentence = ""

    for i, token in enumerate(sample_sentence['tokens']):
        ex_sentence = ex_sentence + token['form']

    print(f"example sentence: {ex_sentence}")
    # Display first few tokens
    print("\nFirst 5 tokens:")
    for i, token in enumerate(sample_sentence['tokens'][:5]):
        print(f"{i+1}. Form: '{token['form']}', POS: {token['upos']}, Relation: {token['deprel']}")



Sample sentence structure:
Sentence ID: test-s1
Number of tokens: 11
example sentence: 然而，这样的处理也衍生了一些问题。

First 5 tokens:
1. Form: '然而', POS: SCONJ, Relation: mark
2. Form: '，', POS: PUNCT, Relation: punct
3. Form: '这样', POS: PRON, Relation: det
4. Form: '的', POS: PART, Relation: case
5. Form: '处理', POS: NOUN, Relation: nsubj


In [10]:
# Find first sentence that has a linear combination of two or more characters
# e.g. [一][栋], [一这][栋只]
sample_sentence = sample_sentences
for idx, sample_sentence in enumerate(sample_sentences):

    ex_sentence = ""

    for token in sample_sentence['tokens']:
        ex_sentence += token['form']
        
    if re.search(r'[一二][个]', ex_sentence):
        print(f"Match at index {idx}: {ex_sentence}")
        break

Match at index 10: 圆齿龙（Globidens）意为“球状牙齿”，是沧龙科的一个属。


In [11]:
# Find first example that contains a specific character
# skip in case you need another one in the idx if statement 

for idx, sample_sentence in enumerate(sample_sentences):
    ex_sentence = ""

    for token in sample_sentence['tokens']:
        ex_sentence += token['form']
            
    if '个' in ex_sentence:
        print(f"Found at index {idx}: {ex_sentence}")
        if idx in [10 ]:
            continue
        break  # Stop once found

Found at index 10: 圆齿龙（Globidens）意为“球状牙齿”，是沧龙科的一个属。
Found at index 17: 毛泽东早在1949年3月中共七届二中全会的报告中就明确地说：“占国民经济总产值90%的分散的个体的农业经济和手工业经济，是可能和必须谨慎地、逐步地而又积极地引导它们向着现代化和集体化的方向发展的，任其自流的观点是错误的。”


sentence 3
token 9
sentence: 台大医学人文博物馆是一**栋**两层楼的建筑，沿中山南路与仁爱路成L型。
Token: 栋

{'id': '8', 
 'form': '栋', **MW Dong (for buildings)**
 'lemma': '栋', 
 'upos': 'NOUN', 
 'xpos': 'NNB', 
 'feats': '_', 
 'head': '7', 
 'deprel': 'clf', **This is where we get the classifier**
 'deps': '_', 
 'misc': 'SpaceAfter=No|Translit=栋|LTranslit=栋', 
 'head_token': {'id': '7',  **The measureword is dependent on the numeral "1"**
                'form': '一', 
                'lemma': '一', 
                'upos': 'NUM', 
                'xpos': 'CD', 
                'feats': 'NumType=Card', 
                'head': '13', 
                'deprel': 'nummod', 
                'deps': '_', 
                'misc': 'SpaceAfter=No|Translit=yī|LTranslit=yī', 
                'head_token': {...}}

sentence 10
token 17
sentence: 圆齿龙（Globidens）意为“球状牙齿”，是沧龙科的一个属。
{'id': '17', 
'form': '个', 
'lemma': '个', 
'upos': 'NOUN', 
'xpos': 'NNB', 
'feats': '_', 
'head': '16', 
'deprel': 'clf', 
'deps': '_', 
'misc': 'SpaceAfter=No|Translit=gè|LTranslit=gè', 
'head_token': {'id': '16', 
                'form': '一', 
                'lemma': '一', 
                'upos': 'NUM', 
                'xpos': 'CD', '
                feats': 'NumType=Card', 
                'head': '18', 
                'deprel': 'nummod', 
                'deps': '_', 
                'misc': 'SpaceAfter=No|Translit=yī|LTranslit=yī', 
                'head_token': {...}}}

sentence 22
token 2
sentence: 这个计算机控制了火箭从起飞前一直到抛弃S-IVB推进器的操作过程。
token: 个
{'id': '2', 
'form': '个', 
'lemma': '个', 
'upos': 'NOUN', 
'xpos': 'NNB', 
'feats': '_', 
'head': '4', 
'deprel': 'nmod', 
'deps': '_', 
'misc': 'SpaceAfter=No|Translit=gè|LTranslit=gè', 
'head_token': {'id': '4', 
                'form': '机', 
                'lemma': '机', 
                'upos': 'PART', 
                'xpos': 'SFN', 
                'feats': '_', 
                'head': '5', 
                'deprel': 'nsubj', 
                'deps': '_', 'misc': 'SpaceAfter=No|Translit=jī|LTranslit=jī', 
                'head_token': {...}}}

sentence 50
token 19
sentence: 换句话说，如果每个标注的点都在100米的高度，这条线代表的就是100米海拔。

{'id': '19', 'form': '条', 
'lemma': '条', 
'upos': 'NOUN', 
'xpos': 'NNB', 
'feats': '_', 
'head': '20', 
'deprel': 'clf', 
'deps': '_', 
'misc': 'SpaceAfter=No|Translit=tiáo|LTranslit=tiáo', 
'head_token': {'id': '20', 
               'form': '线', 
               'lemma': '线', 
               'upos': 'NOUN', 
               'xpos': 'NN', 
               'feats': '_', 
               'head': '21', 
               'deprel': 'nsubj', 
               'deps': '_', 
               'misc': 'SpaceAfter=No|Translit=xiàn|LTranslit=xiàn', 
               'head_token': {....}}}

Classifiers are always marked as nouns for universal part of speech (upos). 
They differ in their classification under a dependency relation (deprel).

In the case where a MW appears before a numeral: 
- deprel: "clf" (classifier). They are marked as dependent on the numeral.
- This is regardless of whether it is 个 or not

MW + Demonstrative:
- If specific MW: deprel: "clf" (classifier). They are marked as dependent on the noun.
- If 个: deprel: "nmod" (nominal modifier). Marked as dependent on the noun. 

MW + Question word (哪) not present in corpus.


In [12]:
test_sentence = sample_sentences[3]
# print(test_sentence)
test_token = test_sentence['tokens'][7]
test_token
# print(test_token['upos'])

{'id': '8',
 'form': '栋',
 'lemma': '栋',
 'upos': 'NOUN',
 'xpos': 'NNB',
 'feats': '_',
 'head': '7',
 'deprel': 'clf',
 'deps': '_',
 'misc': 'SpaceAfter=No|Translit=栋|LTranslit=栋',
 'head_token': {'id': '7',
  'form': '一',
  'lemma': '一',
  'upos': 'NUM',
  'xpos': 'CD',
  'feats': 'NumType=Card',
  'head': '13',
  'deprel': 'nummod',
  'deps': '_',
  'misc': 'SpaceAfter=No|Translit=yī|LTranslit=yī',
  'head_token': {'id': '13',
   'form': '建筑',
   'lemma': '建筑',
   'upos': 'NOUN',
   'xpos': 'NN',
   'feats': '_',
   'head': '22',
   'deprel': 'advcl',
   'deps': '_',
   'misc': 'SpaceAfter=No|Translit=jiànzhù|LTranslit=jiànzhù',
   'head_token': {'id': '22',
    'form': '成',
    'lemma': '成',
    'upos': 'VERB',
    'xpos': 'VV',
    'feats': '_',
    'head': '0',
    'deprel': 'root',
    'deps': '_',
    'misc': 'SpaceAfter=No|Translit=chéng|LTranslit=chéng',
    'head_token': None}}}}

In [13]:
# For testing: print the full example sentence

sample_sentence = sample_sentences[3]
ex_sentence = ""

for i, token in enumerate(sample_sentence['tokens']):
    ex_sentence = ex_sentence + token['form']

print(f"example sentence: {ex_sentence}")

example sentence: 台大医学人文博物馆是一栋两层楼的建筑，沿中山南路与仁爱路成L型。


# Extract Relevant Info

In [24]:
# ## 4. Identifying Measure Words
# 
# Now we'll create functions to identify measure words (classifiers) in Chinese.
# We'll use multiple criteria including POS tags, dependency relations, and a list of common measure words.

# %%

"""
Check whether a given token is a measure word

Measure words should be marked as a 'upos' noun and have a 'clf' (classifier) relationship to the head

Input: token (dict) - A token dictionary with linguistic annotations
Output: Boolean - True if the token is a measure word, False otherwise
"""
def is_measure_word(token):
    # Most tokens will be marked as 'clf'
    if token['upos'] == 'NOUN' and token['deprel'] == 'clf':
        return True
        
    # in the case of demonstratives and 个 we get a different marking, so include those cases as well 
    elif token['form'] == '个':
        return True
        
    else:
        return False

In [25]:
test_sentence = sample_sentences[3]
test_token = test_sentence['tokens'][7]

print(f"Test numeral + MW, should return true: {is_measure_word(test_token)}")

test_sentence = sample_sentences[10]
test_token = test_sentence['tokens'][16]

print(f"Test numeral + 个, should return true: {is_measure_word(test_token)}")


test_sentence = sample_sentences[22]
test_token = test_sentence['tokens'][1]

print(f"Test a demonstrative + MW, should return true: {is_measure_word(test_token)}")


test_sentence = sample_sentences[50]
test_token = test_sentence['tokens'][18]

print(f"Test a demonstrative + 个, should return true: {is_measure_word(test_token)}")


test_sentence = sample_sentences[50]
test_token = test_sentence['tokens'][16]

print(f"Test a non-MW, should return false: {is_measure_word(test_token)}")


Test numeral + MW, should return true: True
Test numeral + 个, should return true: True
Test a demonstrative + MW, should return true: True
Test a demonstrative + 个, should return true: True
Test a non-MW, should return false: False


In [27]:
# Let's find and count all measure words in our sample
if sample_sentences:
    measure_words = {}
    
    for sentence in sample_sentences:
        for token in sentence['tokens']:
            if is_measure_word(token):
                form = token['form']
                if form in measure_words:
                    measure_words[form] += 1
                else:
                    measure_words[form] = 1
    
    print("Measure words found in the sample:")
    for mw, count in sorted(measure_words.items(), key=lambda x: x[1], reverse=True):
        print(f"'{mw}': {count} occurrences")

Measure words found in the sample:
'个': 632 occurrences
'年': 517 occurrences
'月': 373 occurrences
'种': 129 occurrences
'次': 69 occurrences
'名': 57 occurrences
'条': 51 occurrences
'位': 40 occurrences
'座': 33 occurrences
'部': 31 occurrences
'场': 25 occurrences
'所': 21 occurrences
'段': 20 occurrences
'家': 18 occurrences
'项': 18 occurrences
'世纪': 17 occurrences
'颗': 17 occurrences
'届': 15 occurrences
'间': 15 occurrences
'张': 14 occurrences
'年代': 12 occurrences
'支': 11 occurrences
'任': 11 occurrences
'批': 11 occurrences
'枚': 10 occurrences
'件': 8 occurrences
'日': 8 occurrences
'代': 8 occurrences
'篇': 8 occurrences
'层': 7 occurrences
'套': 7 occurrences
'只': 6 occurrences
'辆': 6 occurrences
'时': 6 occurrences
'首': 5 occurrences
'道': 5 occurrences
'款': 5 occurrences
'米': 4 occurrences
'番': 4 occurrences
'句': 4 occurrences
'岁': 4 occurrences
'台': 4 occurrences
'期': 4 occurrences
'份': 4 occurrences
'艘': 4 occurrences
'组': 4 occurrences
'季': 3 occurrences
'块': 3 occurrences
'起': 3 occurrences
'类'

My code differs from Boxuan's here in the output. Her's had 本 as a MW with 1 occurance, mine does not it seems. Double check this example,

In [28]:
def find_related_measure_words(token):

    if test_token["deprel"] == 'clf':
        if test_token["head_token"]["deprel"] == 'nummod':
            return(test_token["head_token"]["head_token"]["lemma"])
            
        elif test_token["head_token"]["upos"] == "NOUN":
            return(test_token["head_token"]["lemma"])

    elif (test_token["deprel"] == 'nmod') and (test_token["lemma"] == '个'):
        return(test_token["head_token"]["lemma"])

In [29]:
test_sentence = sample_sentences[3]
test_token = test_sentence['tokens'][7]

ex_sentence = ""

for token in test_sentence["tokens"]:
    ex_sentence += token['lemma']

print(f"sentence: {ex_sentence}")
print(f"MW: " + test_token["form"])

find_related_measure_words(test_token)


sentence: 台大医学人文博物馆是一栋两层楼的建筑，沿中山南路与仁爱路成L型。
MW: 栋


'建筑'

In [98]:
test_sentence = sample_sentences[10]
test_token = test_sentence['tokens'][16]

ex_sentence = ""

for token in test_sentence["tokens"]:
    ex_sentence += token['lemma']

print(f"sentence: {ex_sentence}")
print(f"MW: " + test_token["form"])

find_related_measure_words(test_token)


sentence: 圆齿龙（Globidens）意为“球状牙齿”，是沧龙科的一个属。
MW: 个


'属'

In [99]:
test_sentence = sample_sentences[22]
test_token = test_sentence['tokens'][1]

ex_sentence = ""

for token in test_sentence["tokens"]:
    ex_sentence += token['lemma']

print(f"sentence: {ex_sentence}")
print(f"MW: " + test_token["form"])

find_related_measure_words(test_token)


sentence: 这个计算机控制了火箭从起飞前一直到抛弃S-IVB推进器的操作过程。
MW: 个


'机'

In [100]:
test_sentence = sample_sentences[50]
test_token = test_sentence['tokens'][18]

ex_sentence = ""

for token in test_sentence["tokens"]:
    ex_sentence += token['lemma']

print(f"sentence: {ex_sentence}")
print(f"MW: " + test_token["form"])

find_related_measure_words(test_token)


sentence: 换句话说，如果每个标注的点都在100米的高度，这条线代表的是100米海拔。
MW: 条


'线'

In [102]:
test_sentence = sample_sentences[50]
test_token = test_sentence['tokens'][15]

ex_sentence = ""

for token in test_sentence["tokens"]:
    ex_sentence += token['lemma']

print(f"sentence: {ex_sentence}")
print(f"MW: " + test_token["form"])

find_related_measure_words(test_token)


sentence: 换句话说，如果每个标注的点都在100米的高度，这条线代表的是100米海拔。
MW: 高度


In [20]:
# ## 5. Finding Noun-Measure Word Relationships
# 
# We'll use dependency relations to find measure words related to nouns, regardless of their position in the sentence.

# %%
def find_related_measure_words(sentence, noun_token):
    """
    Finds measure words related to a specific noun using dependency relations.
    
    Input: 
        - sentence (dict): A sentence dictionary containing tokens and dependency information
        - noun_token (dict): The token dictionary for the noun we're analyzing
    
    Output: Tuple of two lists:
        - pre_mws: List of measure word tokens that appear before the noun
        - post_mws: List of measure word tokens that appear after the noun
    """
    pre_mws = []
    post_mws = []
    noun_id = int(noun_token['id'])
    
    for token in sentence['tokens']:
        if is_measure_word(token):
            mw_id = int(token['id'])
            
            # Check if the measure word is directly related to the noun
            # This handles cases where the measure word and noun have a direct dependency
            if token['head'] == noun_token['id'] or noun_token['head'] == token['id']:
                if mw_id < noun_id:
                    pre_mws.append(token)
                else:
                    post_mws.append(token)
                continue
                
            # Check if they share a common head (like a numeral)
            # This handles cases like "三个苹果" where both "个" and "苹果" depend on "三"
            if token['head'] == noun_token['head'] and token['head'] != '0':
                if mw_id < noun_id:
                    pre_mws.append(token)
                else:
                    post_mws.append(token)
                continue
                
            # Check if the measure word is connected to a determiner or numeral that connects to the noun
            # This handles cases with more complex structures
            if token['head_token'] and token['head_token']['head'] == noun_token['id']:
                if mw_id < noun_id:
                    pre_mws.append(token)
                else:
                    post_mws.append(token)
                continue

            # Check if the numeral connects to the measure word connects to the noun
            # This handles another common structure
            if noun_token['head_token'] and noun_token['head_token']['head'] == token['id']:
                if mw_id < noun_id:
                    pre_mws.append(token)
                else:
                    post_mws.append(token)
    
    return pre_mws, post_mws

# %%
# Let's examine some examples of noun-measure word relationships
if sample_sentences:
    examples = []
    
    for sentence in sample_sentences[:5]:  # Look at first 5 sentences
        for token in sentence['tokens']:
            if token['upos'] == 'NOUN':
                pre_mws, post_mws = find_related_measure_words(sentence, token)
                
                if pre_mws or post_mws:
                    # Store this as an example
                    examples.append({
                        'noun': token['form'],
                        'pre_mws': [t['form'] for t in pre_mws],
                        'post_mws': [t['form'] for t in post_mws]
                    })
    
    print(f"Found {len(examples)} examples of noun-measure word relationships:")
    for i, example in enumerate(examples[:10]):  # Show first 10 examples
        print(f"\nExample {i+1}:")
        print(f"Noun: {example['noun']}")
        print(f"Pre-noun measure words: {', '.join(example['pre_mws']) if example['pre_mws'] else 'None'}")
        print(f"Post-noun measure words: {', '.join(example['post_mws']) if example['post_mws'] else 'None'}")


Found 6 examples of noun-measure word relationships:

Example 1:
Noun: 栋
Pre-noun measure words: None
Post-noun measure words: 栋

Example 2:
Noun: 层
Pre-noun measure words: None
Post-noun measure words: 层

Example 3:
Noun: 楼
Pre-noun measure words: 层
Post-noun measure words: None

Example 4:
Noun: 建筑
Pre-noun measure words: 栋
Post-noun measure words: None

Example 5:
Noun: 天文
Pre-noun measure words: None
Post-noun measure words: 台

Example 6:
Noun: 现
Pre-noun measure words: 台
Post-noun measure words: None


In [21]:
# ## 6. Collecting and Counting Noun-Measure Word Pairs
# 
# Now we'll process all sentences to collect statistics on noun-measure word pairs.

# %%
def collect_noun_mw_pairs(sentences):
    """
    Collects all noun-measure word pairs from a list of sentences using dependency relations.
    
    Input: sentences (list) - List of sentence dictionaries with tokens and dependencies
    
    Output: defaultdict - A nested dictionary structure:
            {noun: {measure_word: {'pre': count, 'post': count}, ...}, ...}
            Where 'pre' is the count of the measure word appearing before the noun,
            and 'post' is the count of the measure word appearing after the noun.
            'NA' is used when no measure word is found.
    """
    noun_mw_data = defaultdict(lambda: defaultdict(lambda: {'pre': 0, 'post': 0}))
    
    for sentence in sentences:
        for token in sentence['tokens']:
            # If current token is a noun
            if token['upos'] == 'NOUN':
                noun = token['form']
                pre_mws, post_mws = find_related_measure_words(sentence, token)
                
                # Count pre-noun measure words
                if pre_mws:
                    for mw_token in pre_mws:
                        noun_mw_data[noun][mw_token['form']]['pre'] += 1
                else:
                    noun_mw_data[noun]['NA']['pre'] += 1
                
                # Count post-noun measure words
                if post_mws:
                    for mw_token in post_mws:
                        noun_mw_data[noun][mw_token['form']]['post'] += 1
                else:
                    noun_mw_data[noun]['NA']['post'] += 1
    
    return noun_mw_data

# %%
# Collect noun-measure word pairs from our sample
sample_noun_mw_data = collect_noun_mw_pairs(sample_sentences)

print(f"Found {len(sample_noun_mw_data)} unique nouns in the sample")

# Display a few examples
print("\nSample of noun-measure word pairs:")
sample_count = 0
for noun, mw_dict in sample_noun_mw_data.items():
    if sample_count >= 5:
        break
        
    print(f"\nNoun: {noun}")
    for mw, counts in mw_dict.items():
        if mw != 'NA' and (counts['pre'] > 0 or counts['post'] > 0):
            print(f"  Measure word: {mw}")
            print(f"    Pre-noun count: {counts['pre']}")
            print(f"    Post-noun count: {counts['post']}")
    
    sample_count += 1

Found 1705 unique nouns in the sample

Sample of noun-measure word pairs:

Noun: 处理

Noun: 问题

Noun: 年
  Measure word: 年
    Pre-noun count: 0
    Post-noun count: 50
  Measure word: 位
    Pre-noun count: 1
    Post-noun count: 0

Noun: 大楼
  Measure word: 位
    Pre-noun count: 1
    Post-noun count: 1
  Measure word: 幢
    Pre-noun count: 1
    Post-noun count: 0

Noun: 构想


In [22]:
# ## 7. Processing All Files and Generating CSV

# %%
def write_to_csv(noun_mw_data, output_file):
    """
    Writes the noun-measure word data to a CSV file.
    
    Input:
        - noun_mw_data (defaultdict): The nested dictionary of noun-measure word pairs and counts
        - output_file (string): Path to the output CSV file
    
    Output: None (writes to a file on disk)
    """
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Noun', 'MW', 'Count_Pre', 'Count_Post'])
        
        for noun in sorted(noun_mw_data.keys()):
            for mw in sorted(noun_mw_data[noun].keys()):
                writer.writerow([
                    noun,
                    mw,
                    noun_mw_data[noun][mw]['pre'],
                    noun_mw_data[noun][mw]['post']
                ])
    
    print(f"CSV file saved as {output_file}")
    
    # Return a DataFrame version for further analysis
    rows = []
    for noun in sorted(noun_mw_data.keys()):
        for mw in sorted(noun_mw_data[noun].keys()):
            rows.append({
                'Noun': noun,
                'MW': mw,
                'Count_Pre': noun_mw_data[noun][mw]['pre'],
                'Count_Post': noun_mw_data[noun][mw]['post']
            })
    
    return pd.DataFrame(rows)

# %%
def process_all_chinese_files(file_paths, output_file='chinese_noun_mw.csv'):
    """
    Processes all Chinese files and combines the results into a single CSV.
    
    Input:
        - file_paths (list): List of file paths to CoNLL-U files
        - output_file (string): Path for the output CSV file
    
    Output: DataFrame containing the noun-measure word data
    """
    all_sentences = []
    
    for file_path in file_paths:
        print(f"Processing {file_path}...")
        sentences = parse_conllu_sentences(file_path)
        all_sentences.extend(sentences)
    
    print(f"Total sentences processed: {len(all_sentences)}")
    
    noun_mw_data = collect_noun_mw_pairs(all_sentences)
    df = write_to_csv(noun_mw_data, output_file)
    
    # Print some statistics
    total_nouns = len(noun_mw_data)
    total_pairs = sum(len(mws) for mws in noun_mw_data.values())
    print(f"Found {total_nouns} unique nouns with {total_pairs} noun-measure word pairs")
    
    return df

# %%
# Process all Chinese files and generate the CSV
# Note: This may take some time depending on the corpus size
conllu_files = [f for f in chinese_files if f.endswith('.conllu')]
df_results = process_all_chinese_files(conllu_files, OUTPUT_CSV)

Processing /Volumes/Server/SHARED/Corpora/Universal_Dependencies/2025_InformationTheory_Project/Universal Dependencies 2.15/ud-treebanks-v2.15/UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu...
Processing /Volumes/Server/SHARED/Corpora/Universal_Dependencies/2025_InformationTheory_Project/Universal Dependencies 2.15/ud-treebanks-v2.15/UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu...
Processing /Volumes/Server/SHARED/Corpora/Universal_Dependencies/2025_InformationTheory_Project/Universal Dependencies 2.15/ud-treebanks-v2.15/UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu...
Total sentences processed: 4997
CSV file saved as /Volumes/Server/SHARED/Corpora/Universal_Dependencies/2025_InformationTheory_Project/chinese_noun_mw.csv
Found 8128 unique nouns with 10136 noun-measure word pairs


# Count Nouns

Create and export dataframe with all nouns in the corpus and their respective counts

In [24]:

all_nouns = []

for sentence in all_sentences:
    for token in sentence['tokens']:
        # all dependency relations that are "modifier words" and "function words" 
        #(i.e. words that may be tagged as Nouns for mandarin but do not appear as nouns) 
        other_deprel = ["csubj", "ccomp", "advcl", "acl", "advmod", "discourse", "amod", "aux", "cop", "mark", "det", "clf", "case"]
        
        # If current token is a noun
        if (token['upos'] == 'NOUN') and (token["deprel"] not in other_deprel):
            all_nouns.append(token['lemma'])
          
print(f"Total number of nouns in the corpus: {len(all_nouns)}")

Total number of nouns in the corpus: 30764


In [31]:
all_nouns_df = pd.DataFrame(all_nouns, columns=['Noun'])

all_nouns_df = all_nouns_df['Noun'].value_counts().reset_index()
all_nouns_df.columns = ['Noun', 'Count']

print(f"Total number of unique nouns in the corpus: {len(all_nouns_df.index)}")
all_nouns_df.head()


Total number of unique nouns in the corpus: 8013


Unnamed: 0,Noun,Count
0,年,1038
1,日,373
2,人,353
3,月,231
4,人口,145


In [33]:
all_nouns_df.to_csv("output/all_UD-Mandarin-corpus_nouns.csv", header=True)
