Data source: the Lancaster Corpus of Mandarin Chinese (LCMC) — https://www.lancaster.ac.uk/fass/projects/corpus/LCMC/

In [4]:
import os
import re
import xml.etree.ElementTree as ET

# T9 keypad lookup: every lowercase ASCII letter maps to the digit key
# ('2'-'9') that carries it on a phone keypad.
t9_map = {
    letter: digit
    for digit, letters in (
        ('2', 'abc'),
        ('3', 'def'),
        ('4', 'ghi'),
        ('5', 'jkl'),
        ('6', 'mno'),
        ('7', 'pqrs'),
        ('8', 'tuv'),
        ('9', 'wxyz'),
    )
    for letter in letters
}

def pinyin_to_digits(pinyin_seq):
    """Convert a sequence of pinyin syllables into one T9 digit string.

    Each alphabetic character is looked up in ``t9_map``. Non-alphabetic
    characters (such as tone digits) and letters that have no entry in
    ``t9_map`` contribute nothing to the result.

    Args:
        pinyin_seq: Iterable of syllable strings.

    Returns:
        The concatenated digit string; empty if nothing could be mapped.
    """
    digits = []
    for syllable in pinyin_seq:
        for ch in syllable:
            if ch.isalpha():
                digits.append(t9_map.get(ch, ''))
    return ''.join(digits)

def normalize_pinyin(syllables):
    """Rewrite the corpus spelling "uu" as "v" in every syllable.

    The pinyin of hanzi such as "吕" should be "lv", but the data spells it
    "luu"; this replaces each "uu" with "v".

    Args:
        syllables: Iterable of syllable strings.

    Returns:
        A new list with the normalized syllables, in the same order.
    """
    normalized = []
    for syllable in syllables:
        normalized.append(syllable.replace('uu', 'v'))
    return normalized

def split_polyphonic(word):
    """Split a pinyin word into its syllable chunks.

    A chunk is a run of lowercase ASCII letters optionally followed by a
    single tone digit 1-5. Anything that does not fit that shape (uppercase
    letters, other digits, punctuation) is left out of the result.

    Args:
        word: Pinyin string for one word, e.g. "ni3hao3".

    Returns:
        List of matched chunks in order of appearance, e.g. ["ni3", "hao3"].
    """
    syllable_pattern = r'[a-z]+[1-5]?'
    chunks = re.findall(syllable_pattern, word)
    return chunks

def extract_samples(pinyin_path, character_path):
    """Pair T9 digit sequences with hanzi words from two parallel XML files.

    Both documents are walked sentence by sentence (``<s>`` elements, in
    document order) and the direct ``<w>`` children of each sentence pair are
    aligned by position. Every pinyin word is split into syllables,
    normalized, and converted to its T9 digit string.

    Args:
        pinyin_path: Location of the pinyin XML file, passed to ``ET.parse``.
        character_path: Location of the character XML file, passed to
            ``ET.parse``.

    Returns:
        A list of ``(digit_seq, hanzi)`` tuples. Pairs where either side is
        empty are left out, and sentences whose two versions contain a
        different number of ``<w>`` elements are skipped entirely.

    Raises:
        xml.etree.ElementTree.ParseError: If either file is not well-formed.
    """
    root_pinyin = ET.parse(pinyin_path).getroot()
    root_character = ET.parse(character_path).getroot()

    samples = []

    # NOTE: zip() stops at the shorter document, so trailing sentences of the
    # longer file are ignored without notice.
    for sp, sc in zip(root_pinyin.iter('s'), root_character.iter('s')):
        # An empty element such as <w/> has text None; treat it as '' rather
        # than crashing. The placeholder is kept so positions stay aligned.
        pinyin_words = [(w.text or '').strip() for w in sp.findall('w')]
        char_words = [(w.text or '').strip() for w in sc.findall('w')]

        if len(pinyin_words) != len(char_words):
            continue  # word counts disagree, so positions cannot be trusted

        for pinyin, hanzi in zip(pinyin_words, char_words):
            syllables = split_polyphonic(pinyin)
            normalized = normalize_pinyin(syllables)
            digit_seq = pinyin_to_digits(normalized)

            if digit_seq and hanzi:
                samples.append((digit_seq, hanzi))

    return samples

In [11]:
pinyin_dir = 'data/LCMC/2474/Lcmc/data/pinyin'
char_dir = 'data/LCMC/2474/Lcmc/data/character'

all_samples = []

# Case-insensitive index of the character directory. It is used only as a
# fallback when the exact file name is not found, so pairing still works on
# case-sensitive filesystems when the two directories differ in letter case
# (e.g. "x.xml" vs "x.XML").
char_names_by_lower = (
    {name.lower(): name for name in os.listdir(char_dir)}
    if os.path.isdir(char_dir)
    else {}
)

# os.listdir makes no ordering guarantee; sorting keeps the sample order
# reproducible from run to run.
for filename in sorted(os.listdir(pinyin_dir)):
    if not filename.lower().endswith('.xml'):
        continue

    pinyin_path = os.path.join(pinyin_dir, filename)
    char_path = os.path.join(char_dir, filename)
    if not os.path.exists(char_path):
        fallback_name = char_names_by_lower.get(filename.lower())
        if fallback_name is None:
            continue  # no character counterpart for this pinyin file
        char_path = os.path.join(char_dir, fallback_name)

    samples = extract_samples(pinyin_path, char_path)
    all_samples.extend(samples)

print(f"Extracted {len(all_samples)} samples.")

Extracted 831576 samples.


In [6]:
# Persist the (digit sequence, hanzi) pairs, one tab-separated pair per line.
with open('data/zh_T9_dataset.tsv', 'w', encoding='utf-8') as f:
    f.writelines(f"{inp}\t{tgt}\n" for inp, tgt in all_samples)

In [12]:
def extract_sentences_from_character_xml(xml_file):
    """Collect the sentences of a character XML file as plain strings.

    For every ``<s>`` element, the stripped text of its direct ``<w>`` and
    ``<c>`` children is concatenated without separators. Children with other
    tags, and children whose text is missing or blank, are ignored.

    Args:
        xml_file: File name or file object accepted by ``ET.parse``.

    Returns:
        List of non-empty sentence strings in document order.
    """
    root = ET.parse(xml_file).getroot()

    sentences = []
    for sentence_elem in root.iter('s'):
        pieces = [
            (child.text or '').strip()
            for child in sentence_elem
            if child.tag in ('w', 'c')
        ]
        # Blank pieces add nothing to the join, so an empty result means the
        # sentence had no usable tokens at all.
        joined = ''.join(pieces)
        if joined:
            sentences.append(joined)
    return sentences

all_sentences = []

# os.listdir makes no ordering guarantee; sorting keeps the sentence order
# reproducible from run to run.
for filename in sorted(os.listdir(char_dir)):
    # Compare the extension case-insensitively so both ".XML" and ".xml"
    # corpus files are picked up.
    if filename.lower().endswith('.xml'):
        filepath = os.path.join(char_dir, filename)
        sentences = extract_sentences_from_character_xml(filepath)
        all_sentences.extend(sentences)

print(f"Extracted {len(all_sentences)} sentences.")

Extracted 45703 sentences.


In [13]:
# Persist the extracted sentences, one sentence per line.
with open('data/zh_sent_dataset.tsv', 'w', encoding='utf-8') as f:
    f.writelines(sent + '\n' for sent in all_sentences)