# Chinese Word Segmentation

## Download the data and Prepare training/test data

In [12]:
import requests

remote_url = "https://raw.githubusercontent.com/hhhuang/nlp2019fall/master/word_segmentation/"

r = requests.get(remote_url + "data/as_training.utf8", allow_redirects=True)
open('as_training.utf8', 'wb').write(r.content)

r = requests.get(remote_url + "data/as_testing_gold.utf8", allow_redirects=True)
open('as_testing_gold.utf8', 'wb').write(r.content)

942571

In [2]:
raw_train = []
raw_test = []
with open("as_training.utf8", encoding="utf8") as fin:
    for line in fin:
        raw_train.append(line.strip().split("　"))   # It is a full white space

with open("as_testing_gold.utf8", encoding="utf8") as fin:
    for line in fin:
        raw_test.append(line.strip().split("　"))   # It is a full white space

print("Number of sentences in the training data: %d" % len(raw_train))
print("Number of sentences in the test data: %d" % len(raw_test))

Number of sentences in the training data: 708953
Number of sentences in the test data: 14432


## Use jieba

In [3]:
import jieba

print(list(jieba.cut("".join(raw_test[0]))))

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\CHRIST~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.243 seconds.
Prefix dict has been built successfully.


['許多', '社區長', '青學苑', '多', '開設', '有書法', '、', '插花', '、', '土風', '舞班', '，']


## Build Your Own Chinese Word Segmenter

## Prepare training instances for sequence labeling by converting a list of words to a sequence of tags

In [4]:
def words_to_tags(words):
    tags = []
    for word in words:
        if len(word) == 1:
            tags.append('S')
        else:
            for i in range(len(word)):
                if i == 0:
                    tags.append('L')
                elif i == len(word) - 1:
                    tags.append('R')
                else:
                    tags.append('M')
    return tags
    
train_X = []
train_Y = []

test_X = []
test_Y = []

for sent in raw_train:
    train_X.append(list("".join(sent)))  # Make the unsegmented sentence as a sequence of characters
    train_Y.append(words_to_tags(sent))
    
for sent in raw_test:
    test_X.append(list("".join(sent)))  # Make the unsegmented sentence
    test_Y.append(words_to_tags(sent))
    
print(test_X[0])
print(test_Y[0])

['許', '多', '社', '區', '長', '青', '學', '苑', '多', '開', '設', '有', '書', '法', '、', '插', '花', '、', '土', '風', '舞', '班', '，']
['L', 'R', 'L', 'R', 'L', 'R', 'L', 'R', 'S', 'L', 'R', 'S', 'L', 'R', 'S', 'L', 'R', 'S', 'L', 'M', 'M', 'R', 'S']


## Create a CRF model for word segmentation

In [5]:
#!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting tabulate
  Downloading tabulate-0.8.7-py3-none-any.whl (24 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.7-cp38-cp38-win_amd64.whl (156 kB)
Installing collected packages: tabulate, python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6 tabulate-0.8.7


In [7]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics

def extract_sent_features(x):
    sent_features = []
    for i in range(len(x)):
        sent_features.append(extract_char_features(x, i))
    return sent_features
    
def extract_char_features(sent, position):
    char_features = {}
    for i in range(-3, 4):
        if len(sent) > position + i >= 0:
            char_features['char_at_%d' % i] = sent[position + i]

    for i in range(-2, 4):
        if len(sent)-1 > position + i >= 0:
            char_features['char_at_%d_%d' % (i, i+1)] = sent[position + i] + sent[position + i+1]
    return char_features

crf_tagger = sklearn_crfsuite.CRF(algorithm='lbfgs', min_freq=10, max_iterations=150, verbose=True)

feature_X = []
for x in train_X:
    feature_X.append(extract_sent_features(x))
crf_tagger.fit(feature_X, train_Y)

loading training data to CRFsuite: 100%|██████████████████████████████████████| 708953/708953 [18:28<00:00, 639.44it/s]



Feature generation
type: CRF1d
feature.minfreq: 10.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 668078
Seconds required: 113.456

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 150
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=15.29 loss=9521412.85 active=666051 feature_norm=1.00
Iter 2   time=14.82 loss=5941119.32 active=668078 feature_norm=5.09
Iter 3   time=15.37 loss=5608706.98 active=668078 feature_norm=5.62
Iter 4   time=7.40  loss=5288120.25 active=668078 feature_norm=6.42
Iter 5   time=7.17  loss=3858885.29 active=668078 feature_norm=12.43
Iter 6   time=7.67  loss=3567340.64 active=668078 feature_norm=14.76
Iter 7   time=7.78  loss=3329652.89 active=668078 feature_norm=14.68
Iter 8   time=7.46  loss=3151551.35 active=668078 feature_norm=16.51
Iter 9   time=7.52  loss=2935182.17 active=6680

Iter 117 time=7.42  loss=638789.95 active=668078 feature_norm=192.36
Iter 118 time=7.62  loss=636433.85 active=668078 feature_norm=193.05
Iter 119 time=7.57  loss=631863.81 active=668078 feature_norm=194.69
Iter 120 time=7.59  loss=629039.57 active=668078 feature_norm=196.84
Iter 121 time=7.25  loss=626447.85 active=668078 feature_norm=198.89
Iter 122 time=7.39  loss=622299.94 active=668078 feature_norm=201.86
Iter 123 time=7.08  loss=619241.79 active=668078 feature_norm=204.75
Iter 124 time=7.91  loss=617767.01 active=668078 feature_norm=203.71
Iter 125 time=8.70  loss=615849.71 active=668078 feature_norm=202.56
Iter 126 time=7.22  loss=613186.50 active=668078 feature_norm=202.07
Iter 127 time=15.05 loss=610653.88 active=668078 feature_norm=203.45
Iter 128 time=7.09  loss=605939.12 active=668078 feature_norm=204.41
Iter 129 time=7.15  loss=603498.64 active=668078 feature_norm=206.19
Iter 130 time=7.17  loss=601765.70 active=668078 feature_norm=208.09
Iter 131 time=7.75  loss=599947.45



CRF(algorithm='lbfgs', keep_tempfiles=None, max_iterations=150, min_freq=10,
    verbose=True)

## Evaluation

In [8]:
def compare(actual_toks, pred_toks):
    i = 0
    j = 0
    p = 0
    q = 0
    tp = 0
    fp = 0
    while i < len(actual_toks) and j < len(pred_toks):
        if p == q:
            if actual_toks[i] == pred_toks[j]:
                tp += 1
            else:
                fp += 1
            p += len(actual_toks[i])
            q += len(pred_toks[j])
            i += 1
            j += 1
        elif p < q:
            p += len(actual_toks[i])
            i += 1
        else:
            fp += 1
            q += len(pred_toks[j])
            j += 1
    return tp, fp, len(actual_toks)
    
def score(actual_sents, pred_sents):
    tp = 0
    fp = 0
    total = 0
    for actual_toks, pred_toks in zip(actual_sents, pred_sents):
        tp_, fp_, total_ = compare(actual_toks, pred_toks)
        tp += tp_
        fp += fp_
        total += total_
    recall = float(tp) / total
    precision = float(tp) / (tp + fp)
    f1 = 2.0 * recall * precision / (recall + precision)
    return recall, precision, f1

In [10]:
def segment(sent):
    tags = crf_tagger.predict_single(extract_sent_features(list(sent)))
    tokens = []
    tok = ""
    for ch, tag in zip(list(sent), tags):
        if tag in ['S', 'L'] and tok != "":
            tokens.append(tok)
            tok = ""
        tok += ch
    if tok:
        tokens.append(tok)
    return tokens
            
print(segment("法國總統馬克宏已到現場勘災，初步傳出火警可能與目前聖母院的維修工程有關。"))

['法國', '總統', '馬克宏', '已', '到', '現場', '勘災', '，', '初步', '傳出', '火警', '可能', '與', '目前', '聖母院', '的', '維修', '工程', '有關', '。']


In [11]:
pred = []
actual = []
for sent in raw_test:
    pred.append(segment("".join(sent)))
    actual.append(sent)
print(actual[0])
print(pred[0])

print(score(actual, pred))

['許多', '社區', '長青', '學苑', '多', '開設', '有', '書法', '、', '插花', '、', '土風舞班', '，']
['許多', '社區', '長青', '學苑', '多', '開', '設有', '書法', '、', '插花', '、', '土風舞班', '，']
(0.9266164285248255, 0.9166438079870916, 0.9216031407412214)
