In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from utils.imports import *

In [3]:
fn_corpora = Path('datasets/北大人民日报语料库.txt')

with open(fn_corpora, 'r', encoding='gbk') as f:
    raw_corpora = f.read()

sentences = raw_corpora.split('\n')

sentence = sentences[1]
sentence.split('  ')

['19980101-01-001-002/m',
 '中共中央/nt',
 '总书记/n',
 '、/w',
 '国家/n',
 '主席/n',
 '江/nr',
 '泽民/nr',
 '']

In [4]:
words = [o for o in sentence.split('  ')[1:-1]]

words

['中共中央/nt', '总书记/n', '、/w', '国家/n', '主席/n', '江/nr', '泽民/nr']

# Collect words

In [5]:
words = [o for sentence in sentences for o in sentence.split('  ')[1:-1]]

len(words), words[:5]

(1121429, ['迈向/v', '充满/v', '希望/n', '的/u', '新/a'])

In [6]:
# clean annotations
words_clean = [o.split('/')[0] for o in words]
len(words_clean), words_clean[:5]

(1121429, ['迈向', '充满', '希望', '的', '新'])

In [7]:
sentences_space = ' '.join(words_clean)
sentences_space[:10]

'迈向 充满 希望 的'

In [9]:
def create_labels(sentences_space):
    sentences_space_np = np.array(list(sentences_space))
    space_bool = (sentences_space_np == ' ')
    space_idx = np.argwhere(space_bool).squeeze()

    shift = np.arange((space_bool == True).sum())

    b_idx = space_idx - shift
    b_idx

    concatenated = ''.join(words_clean)
    b_bool = np.zeros(len(concatenated))
    b_bool[b_idx] = 1

    prepared_sentences = list(zip(concatenated, b_bool))
    return prepared_sentences

prepared_sentences = create_labels(sentences_space)

prepared_sentences[:4], len(prepared_sentences)

[('迈', 0.0), ('向', 0.0), ('充', 1.0), ('满', 0.0)]

In [18]:
def create_char_features(sentence, i):
    features = [
        'bias',
        'char=' + sentence[i][0] 
    ]
    
    if i >= 1:
        features.extend([
            'char-1=' + sentence[i-1][0],
            'char-1:0=' + sentence[i-1][0] + sentence[i][0],
        ])
    else:
        features.append("BOS")
        
    if i >= 2:
        features.extend([
            'char-2=' + sentence[i-2][0],
            'char-2:0=' + sentence[i-2][0] + sentence[i-1][0] + sentence[i][0],
            'char-2:-1=' + sentence[i-2][0] + sentence[i-1][0],
        ])
        
    if i >= 3:
        features.extend([
            'char-3:0=' + sentence[i-3][0] + sentence[i-2][0] + sentence[i-1][0] + sentence[i][0],
            'char-3:-1=' + sentence[i-3][0] + sentence[i-2][0] + sentence[i-1][0],
        ])
        
        
    if i + 1 < len(sentence):
        features.extend([
            'char+1=' + sentence[i+1][0],
            'char:+1=' + sentence[i][0] + sentence[i+1][0],
        ])
    else:
        features.append("EOS")
        
    if i + 2 < len(sentence):
        features.extend([
            'char+2=' + sentence[i+2][0],
            'char:+2=' + sentence[i][0] + sentence[i+1][0] + sentence[i+2][0],
            'char+1:+2=' + sentence[i+1][0] + sentence[i+2][0],
        ])
        
    if i + 3 < len(sentence):
        features.extend([
            'char:+3=' + sentence[i][0] + sentence[i+1][0] + sentence[i+2][0]+ sentence[i+3][0],
            'char+1:+3=' + sentence[i+1][0] + sentence[i+2][0] + sentence[i+3][0],
        ])
    
    return features



def create_sentence_features(prepared_sentence):
    return [create_char_features(prepared_sentence, i) for i in range(len(prepared_sentence))]

def create_sentence_labels(prepared_sentence):
    return [str(part[1]) for part in prepared_sentence]

In [26]:
valid_ratio = 0.2
n_samples = len(prepared_sentences)
n_valid = int(valid_ratio * n_samples)

In [27]:
X = create_sentence_features(prepared_sentences[:-n_valid])
y = create_sentence_labels(prepared_sentences[:-n_valid])

X_test = create_sentence_features(prepared_sentences[-n_valid:])
y_test = create_sentence_labels(prepared_sentences[-n_valid:])

In [31]:
len(X), len(y)

(1480700, 1480700)

In [32]:
X[0], y[0]

(['bias',
  'char=迈',
  'BOS',
  'char+1=向',
  'char:+1=迈向',
  'char+2=充',
  'char:+2=迈向充',
  'char+1:+2=向充',
  'char:+3=迈向充满',
  'char+1:+3=向充满'],
 '0.0')

In [33]:
import pycrfsuite

trainer = pycrfsuite.Trainer(verbose=False)

trainer.append(X, y)
trainer.set_params({
    'c1': 1.0, 
    'c2': 1e-3,
    'max_iterations': 60,
    'feature.possible_transitions': True
})


In [34]:
trainer.train('latin-text-segmentation.crfsuite')

In [35]:
tagger = pycrfsuite.Tagger()
tagger.open('latin-text-segmentation.crfsuite')

<contextlib.closing at 0x7f1002133a58>

In [55]:
tp = 0
fp = 0
fn = 0
n_correct = 0
n_incorrect = 0

prediction = tagger.tag(create_sentence_features(prepared_sentences))
correct = create_sentence_labels(prepared_sentences)
zipped = list(zip(prediction, correct))
tp +=        len([_ for l, c in zipped if l == c and l == 1])
fp +=        len([_ for l, c in zipped if l == 1 and c == 0])
fn +=        len([_ for l, c in zipped if l == 0 and c == 1])
n_incorrect += len([_ for l, c in zipped if l != c])
n_correct   += len([_ for l, c in zipped if l == c])

In [69]:
y_true, y_scores = np.array(correct, dtype=np.float), np.array(prediction, dtype=np.float)
from sklearn.metrics import average_precision_score
average_precision_score(y_true, y_scores)

0.7658562029872773

In [70]:
print("Precision:\t" + str(tp/(tp+fp)))
print("Recall:\t\t" + str(tp/(tp+fn)))
print("Accuracy:\t" + str(n_correct/(n_correct+n_incorrect)))

ZeroDivisionError: division by zero