In [None]:
import bs4, os
from pymorphy2 import tokenizers
import pymorphy2 as pm
import pandas as pd

import sklearn
from sklearn.model_selection import KFold

import numpy as np
import sklearn_crfsuite
from sklearn_crfsuite import metrics

Parsing annotation info from xml files

*Парсим информацию о разметке из xml-файлов*

In [None]:
def parsefile(fn):
    """Parse one annotated XML file into a flat list of labelled tokens.

    The file is read as UTF-8 and parsed with BeautifulSoup's "lxml" parser.
    Children of the ``<document>`` element are skipped until a ``<header>``
    child is met. From that point on:

    * bare text nodes are tokenized with pymorphy2's simple tokenizer and
      every token is labelled 'O';
    * each ``<segment>`` element yields one entry whose word is the segment
      text. Its ``features`` attribute is split on ';'; when that gives
      exactly two parts and the second is non-empty and does not end with
      '_', the second part is the label, otherwise the label is 'O'.

    Args:
        fn: path of the XML file to read.

    Returns:
        A list of ``{'word': str, 'label': str}`` dicts in document order.

    Raises:
        ValueError: if the file contains no ``<document>`` element.
    """
    # The context manager closes the file even when reading/parsing fails.
    with open(fn, encoding='utf-8') as f:
        soup = bs4.BeautifulSoup(f.read(), "lxml")

    document = soup.find('document')
    if document is None:
        raise ValueError('no <document> element found in %r' % (fn,))

    header_seen = False
    wfs = []
    for c in document.children:
        if c.name == 'header':
            header_seen = True
        if not header_seen:
            continue

        if c.name is None:
            # Text that received no annotation: tokenize it and tag every
            # token with 'O'.
            # str(c) keeps the original casing. The previous c.title() call
            # was str.title(), which title-cased every unannotated token and
            # so distorted the case features for exactly the 'O' words.
            for w in tokenizers.simple_word_tokenize(str(c)):
                wfs.append({'word': w, 'label': 'O'})
        elif c.name == 'segment':
            # Annotated words: check the annotation and assign the matching
            # label. A missing 'features' attribute is treated as empty.
            ann = (c.get('features') or '').split(';')
            if len(ann) == 2 and ann[1] and not ann[1].endswith('_'):
                label = ann[1]
            else:
                label = 'O'
            wfs.append({'word': c.get_text(), 'label': label})
    return wfs

Feature extraction for every wordform:

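- long list (for the current word and its immediate left and right neighbours): lowercased word, normal form (lemma), first 2 and first 5 letters, upper-case / title-case / all-digits flags, part of speech, tense and person (for verbs);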

- short list (for the word two positions ahead of the current one): lowercased word, first 5 letters, part of speech.


*Вычисляем параметры для всех словоформ:*

*длинный список: слово, нормальная форма, первые 2 и 5 букв слова, хар-ки регистра и цифры, часть речи, время (глагола), лицо (глагола)
короткий список (для отдаленных слов): слово, первые 5 букв, часть речи.*

In [None]:
# Module-level pymorphy2 analyzer, created once and shared by the feature
# extraction functions defined below.
m = pm.MorphAnalyzer()
def wordfeats(word, pref = ''):
    """Build the full ("long") feature dict for a single word form.

    Args:
        word: the token as it appears in the text.
        pref: string prepended to every feature name, so that features of
            neighbouring tokens can live in the same dict.

    Returns:
        A dict with the lowercased word, its 2- and 5-character prefixes,
        the first normal form reported by the analyzer, the
        isupper/istitle/isdigit flags, and the part of speech, tense and
        person (as strings) of the first tag reported by the analyzer.
    """
    lowered = word.lower()
    first_tag = m.tag(word)[0]
    lemma = m.normal_forms(word)[0]
    named_values = (
        ('lower', lowered),
        ('word_2', lowered[:2]),
        ('word_5', lowered[:5]),
        ('lemma', lemma),
        ('isupper', word.isupper()),
        ('istitle', word.istitle()),
        ('isdigit', word.isdigit()),
        ('postag', str(first_tag.POS)),
        ('tense', str(first_tag.tense)),
        ('person', str(first_tag.person)),
    )
    return {pref + name: value for name, value in named_values}

def minifeats(word, pref = ''):
    """Build the reduced ("short") feature dict for a single word form.

    Args:
        word: the token as it appears in the text.
        pref: string prepended to every feature name.

    Returns:
        A dict with the lowercased word, its 5-character prefix and the part
        of speech (as a string) of the first tag reported by the analyzer.
    """
    lowered = word.lower()
    pos = str(m.tag(word)[0].POS)
    feats = {}
    feats[pref + 'lower'] = lowered
    feats[pref + 'word_5'] = lowered[:5]
    feats[pref + 'postag'] = pos
    return feats

def getfeats(text, i):
    """Collect the features of token ``i`` together with its context.

    The token itself contributes the long feature list without a prefix.
    When they exist, the previous token adds the long list under 'mi_', the
    next token adds the long list under 'pl_', and the token two positions
    ahead adds the short list under 'pl2_'.

    Args:
        text: sequence of tokens of one document.
        i: index of the token to describe.

    Returns:
        A single dict merging all of the feature groups above.
    """
    n = len(text)
    feats = wordfeats(text[i])
    # (is the neighbour present?, its index, extractor, feature-name prefix)
    context = (
        (i >= 1, i - 1, wordfeats, 'mi_'),
        (i + 1 < n, i + 1, wordfeats, 'pl_'),
        (i + 2 < n, i + 2, minifeats, 'pl2_'),
    )
    for present, pos, extract, prefix in context:
        if present:
            feats.update(extract(text[pos], pref = prefix))
    return feats

Reading the XML files and extracting features

*Читаем xml и вычисляем параметры*

In [None]:
def _ragged_object_array(seqs):
    """Pack a list of sequences into a 1-D object array, one item per slot.

    np.array() on lists of unequal length raises ValueError on recent NumPy
    releases (and silently builds a 2-D array when all lengths happen to be
    equal), so the array is filled element by element instead.
    """
    arr = np.empty(len(seqs), dtype=object)
    for pos, seq in enumerate(seqs):
        arr[pos] = seq
    return arr


def geddicts(foo):
    """Read every file in a directory and extract per-token features.

    Each directory entry is parsed with ``parsefile``; for every token a
    feature dict is computed with ``getfeats``. One line with the file name
    and its token count is printed per file.

    Args:
        foo: path of the directory holding the annotated XML files.

    Returns:
        A tuple ``(features, labels, fns)`` where ``features`` and ``labels``
        are 1-D object arrays with one entry per document (a list of feature
        dicts and a list of label strings respectively) and ``fns`` is the
        list of file names in the same order.
    """
    # Sorted so that document indices are reproducible between runs;
    # os.listdir() itself returns entries in arbitrary order.
    fns = sorted(os.listdir(foo))
    allfs = []
    allbs = []
    for fn in fns:
        wfs = parsefile(os.path.join(foo, fn))
        ws = [d['word'] for d in wfs]
        labels = [d['label'] for d in wfs]
        feats = [getfeats(ws, i) for i in range(len(ws))]
        print (fn, '\t', len(ws), 'words')
        allfs.append(feats)
        allbs.append(labels)
    return _ragged_object_array(allfs), _ragged_object_array(allbs), fns

Experiment settings:

3-fold cross-validation with folds split by document, training a conditional random fields (CRF) classifier on each fold, and token-level evaluation with flat_classification_report.


*Проводим эксперимент:* 
*3-х-кратная кросс-валидация по документам, обучение conditional random fields, тестирование и оценка результатов по индивидуальным словам (flat_classification_report), а не документам.*

In [None]:
def _as_object_array(seqs):
    """Pack a list of sequences into a 1-D object array, one item per slot.

    Filling element by element avoids np.array()/np.concatenate() trying to
    build a rectangular array out of sequences of different lengths.
    """
    arr = np.empty(len(seqs), dtype=object)
    for pos, seq in enumerate(seqs):
        arr[pos] = seq
    return arr


def evalcrf(X, y, n_splits=3, random_state=None):
    """Cross-validate a CRF tagger over documents and print a token report.

    The documents are split with a shuffled K-fold. For each fold a fresh
    ``sklearn_crfsuite.CRF`` is fitted on the training documents and used to
    predict the test documents. After all folds a flat classification report
    is printed for every class except 'O'.

    Args:
        X: indexable collection of documents, each a list of feature dicts.
        y: indexable collection of label sequences aligned with ``X``.
        n_splits: number of folds (default 3).
        random_state: seed passed to ``KFold`` to make the shuffle
            reproducible; ``None`` keeps the unseeded behaviour.

    Returns:
        A tuple ``(alltest, allpred, testind)``: 1-D object arrays with the
        gold and predicted label sequences of every test document, and an
        integer array with the index of each of those documents in ``X``.
        All three are in the order the folds were processed.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    gold = []
    pred = []
    doc_ids = []
    labels = []
    for train, test in kf.split(X):
        # Plain indexing works for both lists and object arrays.
        X_train = [X[i] for i in train]
        y_train = [y[i] for i in train]
        X_test = [X[i] for i in test]
        y_test = [y[i] for i in test]
        # Initial hyper-parameter values are taken from the crfsuite
        # documentation; they could be tuned further on the training data.
        crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True)

        crf.fit(X_train, y_train)
        y_pred = crf.predict(X_test)

        # Accumulate in Python lists: per-document sequences have different
        # lengths, which np.concatenate cannot handle reliably.
        gold.extend(y_test)
        pred.extend(y_pred)
        doc_ids.extend(test)

        # Gather the classes seen by every fold's model (not only the last
        # one), keeping first-seen order and leaving out the 'O' class.
        for lab in crf.classes_:
            if lab != 'O' and lab not in labels:
                labels.append(lab)

    # Report only the classes of interest (e.g. neg, phen-b, phen-i),
    # without the "null" class 'O'.
    print(metrics.flat_classification_report(
            gold, pred, labels=labels, digits=3
        ))
    return (_as_object_array(gold), _as_object_array(pred),
            np.array(doc_ids, dtype=int))

Reading actual data and extracting features

*Читаем данные и вычисляем параметры*

In [None]:
# Directory with the annotated XML files, relative to the working directory.
# Built with os.path.join so the separator is right on every platform; the
# trailing empty component keeps the trailing separator that geddicts
# relies on when it forms the file paths.
foo = os.path.join(os.curdir, 'annotated_data', '')
cfeatures, clabels, fns = geddicts(foo)
print(u'Документов: ', len(cfeatures))

10003_С2015 11856_С2015.xml.xml 	 296 words
10004_С2011 10494_С2011.xml.xml 	 405 words
10004_С2015.xml.xml 	 275 words
10006_С2013.xml.xml 	 11 words
10007_С2013.xml.xml 	 154 words
10008_С2012.xml.xml 	 176 words
10008_С2015.xml.xml 	 212 words
10010_С2012.xml.xml 	 67 words
10011_С2013.xml.xml 	 141 words
10012_С2013.xml.xml 	 11 words
10013_С2013.xml.xml 	 107 words
10014_С2013.xml.xml 	 11 words
10015_С2012.xml.xml 	 5 words
10015_С2013.xml.xml 	 11 words
10016_С2013.xml.xml 	 111 words
10021_С2015.xml.xml 	 1097 words
10031_С2011.xml.xml 	 80 words
10039_С2011.xml.xml 	 175 words
1004_С2015.xml.xml 	 11 words
10083_С2011.xml.xml 	 116 words
10092_С2012.xml.xml 	 83 words
10099_С2010.xml.xml 	 97 words
10101_С2010 10540_С2010 11638_С2010.xml.xml 	 552 words
10107_С2011.xml.xml 	 162 words
10109_С2011.xml.xml 	 155 words
10113_С2011.xml.xml 	 234 words
10114_С2011.xml.xml 	 116 words
10145_С2010.xml.xml 	 24 words
10149_С2011 10612_С2011.xml.xml 	 148 words
10151_С2011 14936_С2011.

Experiment with actual data. Results output on relevant classes.

*Эксперимент с данными. Выводим результаты по интересующим нас классам.*

In [None]:
# Run the cross-validated CRF experiment: evalcrf prints the per-class report
# and returns the gold labels, the predictions and the test document indices.
alltest, allpred, testind = evalcrf(cfeatures, clabels)

             precision    recall  f1-score   support

     phen-b      0.862     0.855     0.858       110
     phen-i      0.884     0.894     0.889       274
        neg      0.906     0.928     0.917       125

avg / total      0.885     0.894     0.889       509



Output of errors by documents and words

*Вывод ошибок по документам и словам*

In [None]:
# For every test document print its file name, then one line per token whose
# predicted label differs from the gold one: position, word, gold, predicted.
for doc_pos, gold_seq in enumerate(alltest):
    src = testind[doc_pos]
    print(fns[src])
    for tok_pos, gold in enumerate(gold_seq):
        guess = allpred[doc_pos][tok_pos]
        if gold == guess:
            continue
        print(tok_pos, cfeatures[src][tok_pos]['lower'], gold, guess)

10007_С2013.xml.xml
75 болей phen-i phen-b
10008_С2012.xml.xml
0 анамнез phen-b O
1 гипертонической phen-i phen-b
162 осложнений O phen-b
163 не O neg
164 было O phen-i
10010_С2012.xml.xml
10015_С2013.xml.xml
10016_С2013.xml.xml
10039_С2011.xml.xml
1004_С2015.xml.xml
10101_С2010 10540_С2010 11638_С2010.xml.xml
10107_С2011.xml.xml
10114_С2011.xml.xml
0 анамнеза phen-b O
1 аг phen-i O
2 нет neg O
10149_С2011 10612_С2011.xml.xml
10178_С2011.xml.xml
10180_С2014.xml.xml
10195_С2011.xml.xml
10219_С2015.xml.xml
87 повышение phen-b O
88 ад phen-i O
89 : phen-i O
90 давность phen-i O
91 не neg O
92 отмечает phen-i O
10225_С2013.xml.xml
10235_С2015.xml.xml
10238_С2012.xml.xml
76 лечение phen-b O
77 : phen-i O
10263_С2011.xml.xml
19 постоянно O phen-b
20 ад phen-b phen-i
44 без neg O
45 иррадиации phen-i O
10286_С2014.xml.xml
775 ангинозные phen-b O
776 боли phen-i O
777 : phen-i O
778 нек neg O
779 рецидивируют phen-i O
10333_С2012.xml.xml
376 статинами O phen-b
377 боли phen-b phen-i
10338_С201

Output of correct results by documents and words

*Вывод верных срабатываний по документам и словам*

In [None]:
# For every test document print its file name, then one line per token that
# was predicted correctly and is not 'O': position, word, gold, predicted.
for doc_pos, gold_seq in enumerate(alltest):
    src = testind[doc_pos]
    print(fns[src])
    for tok_pos, gold in enumerate(gold_seq):
        guess = allpred[doc_pos][tok_pos]
        if gold != guess or gold == 'O':
            continue
        print(tok_pos, cfeatures[src][tok_pos]['lower'], gold, guess)

10007_С2013.xml.xml
19 не neg neg
20 наблюдалась phen-i phen-i
22 лекарственные phen-b phen-b
23 препараты phen-i phen-i
24 не neg neg
25 принимала phen-i phen-i
59 без neg neg
60 иррадиации phen-i phen-i
63 не neg neg
64 купировалась phen-i phen-i
76 не neg neg
77 было phen-i phen-i
84 госпитализация phen-b phen-b
85 не neg neg
86 предлагалась phen-i phen-i
149 лекарственные phen-b phen-b
150 препараты phen-i phen-i
151 не neg neg
152 принимала phen-i phen-i
10008_С2012.xml.xml
2 болезни phen-i phen-i
3 отрицает neg neg
30 регулярную phen-b phen-b
31 терапию phen-i phen-i
32 не neg neg
33 получал phen-i phen-i
78 ангинозные phen-b phen-b
79 боли phen-i phen-i
80 не neg neg
81 беспокоили phen-i phen-i
10010_С2012.xml.xml
10015_С2013.xml.xml
10016_С2013.xml.xml
15 повышение phen-b phen-b
16 ад phen-i phen-i
17 отрицает neg neg
55 онмк phen-b phen-b
56 : phen-i phen-i
57 отрицает neg neg
10039_С2011.xml.xml
167 ангинозные phen-b phen-b
168 боли phen-i phen-i
169 , phen-i phen-i
170 наруш

100 рецидивирует phen-i phen-i
10282_С2014 10369_С2014.xml.xml
10282_С2015 11064_С2015.xml.xml
18 по phen-b phen-b
19 поводу phen-i phen-i
20 аг phen-i phen-i
21 не neg neg
22 обследована phen-i phen-i
67 к phen-b phen-b
68 врачам phen-i phen-i
69 не neg neg
70 обращалась phen-i phen-i
10295_С2014.xml.xml
10299_С2010 10868_С2010.xml.xml
90 к phen-b phen-b
91 врачу phen-i phen-i
92 не neg neg
93 обращался phen-i phen-i
10317_С2015.xml.xml
114 , phen-i phen-i
115 ба phen-i phen-i
116 , phen-i phen-i
117 заболевания phen-i phen-i
118 щж phen-i phen-i
119 : phen-i phen-i
120 отрицает neg neg
121 инфаркт phen-b phen-b
122 миокарда phen-i phen-i
123 : phen-i phen-i
124 отрицает neg neg
10357_С2013.xml.xml
52 постоянной phen-b phen-b
53 антигипертензивной phen-i phen-i
54 терапии phen-i phen-i
55 не neg neg
56 получает phen-i phen-i
68 оим phen-b phen-b
69 в phen-i phen-i
70 анамнезе phen-i phen-i
71 отрицает neg neg
10370_С2014.xml.xml
189 без neg neg
190 ухудшений phen-i phen-i
10398_С2010.