In [1]:
import string
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
try:
    from lxml import etree
except ImportError:
    print('lxml not found. xml.etree.ElementTree will be used')
    import xml.etree.ElementTree as etree

In [15]:
class DataParser:
    """Reads POS-tagged sentences from the numbered XML files under ``annot/``.

    Files are addressed as ``annot/<n>.xml`` with ``n`` starting at 1. Every
    sentence is returned as a list of ``(word, tag)`` tuples, where ``tag`` is
    the ``v`` attribute of the first ``tfr/v/l/g`` element of a token.
    """

    # Upper bound applied to the requested number of files.
    _MAX_FILES = 4062

    # Zero-based loop indices that read_data skips; index i corresponds to
    # annot/<i + 1>.xml (the original note next to the check read
    # "missing 34.xml" for index 33).
    _MISSED = frozenset([
        33, 198, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536,
        1139, 1177, 1178, 1244, 1386, 1612, 1639, 1640, 1641, 1642, 1643,
        1644, 1645, 1646, 1647, 1648,
    ])

    def __init__(self, limit):
        """Remember how many files (counted from ``annot/1.xml``) to consider."""
        self._limit = limit

    def read_data(self):
        """Parse the first ``limit`` files and return all their sentences.

        The limit is capped at ``_MAX_FILES`` without modifying the value
        stored on the instance. Indices listed in ``_MISSED`` are skipped, and
        files that cannot be opened are silently ignored.

        Returns:
            list[list[tuple[str, str]]]: one list of ``(word, tag)`` pairs per
            sentence, in file order.
        """
        tagget_sentences = []
        for i in range(min(self._limit, self._MAX_FILES)):
            if i in self._MISSED:
                continue
            try:
                tagget_sentences.extend(self.read_xml('annot/{}.xml'.format(i + 1)))
            except IOError:
                # Unreadable or absent file: reading is best-effort, move on.
                continue
        return tagget_sentences

    def read_xml(self, filename):
        """Extract the tagged sentences from a single annotation file.

        Args:
            filename: path of the XML file to parse.

        Returns:
            list[list[tuple[str, str]]]: one entry per sentence element. A
            token without a ``tfr/v/l/g`` chain, or whose ``g`` element has no
            ``v`` attribute, is left out of its sentence. A document without a
            ``paragraphs`` element yields an empty list, and a sentence without
            a ``tokens`` element yields an empty sentence.

        Raises:
            IOError: if the file cannot be read.
            KeyError: if a token has no ``text`` attribute.
        """
        tree = etree.parse(filename)
        paragraphs = tree.find('paragraphs')
        if paragraphs is None:
            return []

        tagget_sentences = []
        for paragraph in paragraphs:
            for sentence in paragraph:
                tokens = sentence.find('tokens')
                sent = []
                # `is None` on purpose: an element without children is falsy.
                for token in (tokens if tokens is not None else ()):
                    word = token.attrib['text']
                    try:
                        pos = token.find('tfr').find('v').find('l').find('g').attrib['v']
                    except (AttributeError, KeyError):
                        # A link of the chain is missing (find() returned None)
                        # or the grammeme has no 'v' attribute: skip the token.
                        continue
                    sent.append((word, pos))
                tagget_sentences.append(sent)
        return tagget_sentences
    
class Tagger:
    """Part-of-speech tagger: a decision tree over hand-crafted token features.

    The sentences returned by ``parser.read_data()`` are split in order: the
    leading ``train_fraction`` of them train the model (in the constructor),
    the remainder is the held-out set used by ``get_score``, ``get_report``
    and ``print_mistakes``.
    """

    def __init__(self, parser, train_fraction=.75):
        """Read the corpus through ``parser`` and train the classifier.

        Args:
            parser: object exposing ``read_data()`` that returns a list of
                sentences, each a list of ``(word, tag)`` tuples.
            train_fraction: share of the sentences (taken from the start of
                the list) used for training; defaults to 0.75.
        """
        self._parser = parser
        self._tt_data_limit = train_fraction
        self._tagget_sentences = parser.read_data()
        self._clf = self._train_model()

    def _split_index(self):
        """Index of the first held-out sentence."""
        return int(len(self._tagget_sentences) * self._tt_data_limit)

    def _held_out_dataset(self):
        """Feature dicts and gold tags for the sentences after the split index."""
        return self._transform_to_dataset(self._tagget_sentences[self._split_index():])

    def _features(self, sentence, index):
        """Build the feature dict for the word at ``index`` of ``sentence``.

        Args:
            sentence: list of words (plain strings, no tags).
            index: position of the word to describe.

        Returns:
            dict mapping feature name to a str or bool value.

        Raises:
            IndexError: if the word at ``index`` is an empty string.
        """
        word = sentence[index]
        last = len(sentence) - 1
        prevw = '' if index == 0 else sentence[index - 1]
        nextw = '' if index == last else sentence[index + 1]
        # Features that were tried and are currently disabled: is_all_caps,
        # is_all_lower, digit_start, prev_word, next_word, len_gt_3, is_numeric.
        return {
            'word': word,
            'is_first': index == 0,
            'is_last': index == last,
            # Also True when the first character has no case (digit, punctuation).
            'is_capitalized': word[0].upper() == word[0],
            'prefix-1': word[0],
            'prefix-2': word[:2],  # some specific signs
            'suffix-1': word[-1],  # y
            'suffix-2': word[-2:],  # ed, ly
            'suffix-3': word[-3:],  # ing
            'suffix-4': word[-4:],
            # NOTE(review): Latin vowels here, Cyrillic vowels in 'vowels' below;
            # confirm this is intended before changing (it would alter the model).
            'vowel_last': word[-1] in 'aeiouy',
            'vowels': ''.join([i for i in word if i in 'аоиеёэыуюя']),
            # Substring test against string.punctuation, not a per-character one.
            'is_punctuation': word in string.punctuation,
            # Only a lowercase Latin first letter counts.
            'latin': word[0] in 'abcdefghijklmnopqrstuvwxyz',
            'prev-suffix-1': prevw[-1:],
            'prev-suffix-2': prevw[-2:],
            'prev-suffix-3': prevw[-3:],
            'next-suffix-1': nextw[-1:],
            'next-suffix-2': nextw[-2:],
            'next-suffix-3': nextw[-3:],
            'contains_hypen': '-' in word,
            'capitals_inside': word[1:].lower() != word[1:]
        }

    def _remove_tags(self, ts):
        """Return only the words of a tagged sentence."""
        return [w for (w, t) in ts]

    def _transform_to_dataset(self, ts):
        """Turn tagged sentences into parallel lists of feature dicts and tags.

        Args:
            ts: list of sentences, each a list of ``(word, tag)`` tuples.

        Returns:
            tuple ``(X, y)``: ``X`` holds one feature dict per token and ``y``
            the corresponding tag, both in corpus order.
        """
        X, y = [], []
        for tagged in ts:
            # Strip the tags once per sentence instead of once per token.
            words = self._remove_tags(tagged)
            for index, (_, tag) in enumerate(tagged):
                X.append(self._features(words, index))
                y.append(tag)
        return X, y

    def _train_model(self):
        """Fit and return the vectorizer + decision-tree pipeline.

        Only the sentences before the split index are used. The tree is built
        without a fixed ``random_state``, exactly as before.
        """
        X, y = self._transform_to_dataset(self._tagget_sentences[:self._split_index()])
        clf = Pipeline([
            ('vectorizer', DictVectorizer(sparse=False)),
            ('classifier', DecisionTreeClassifier(criterion='entropy'))
        ])
        clf.fit(X, y)
        return clf

    def get_tags(self):
        """Return one example per distinct tag found in the whole corpus.

        Returns:
            list of ``(word, tag)`` tuples: for each tag, the first token that
            carries it, in order of first appearance.
        """
        seen = set()
        examples = []
        for tagged in self._tagget_sentences:
            for (word, tag) in tagged:
                if tag not in seen:
                    seen.add(tag)
                    examples.append((word, tag))
        return examples

    def get_score(self):
        """Return the pipeline's ``score`` on the held-out sentences."""
        X, y = self._held_out_dataset()
        return self._clf.score(X, y)

    def get_report(self):
        """Return ``classification_report`` text for the held-out sentences."""
        X, y = self._held_out_dataset()
        return classification_report(y, self._clf.predict(X))

    def pos_tag(self, sentences):
        """Tag untagged sentences.

        Args:
            sentences: iterable of sentences, each a sequence of words.

        Returns:
            list with one entry per input sentence: a list of ``(word, tag)``
            tuples. An empty sentence yields an empty list without calling the
            classifier.
        """
        tagget = []
        for s in sentences:
            if len(s) == 0:
                # Nothing to predict; avoid handing the pipeline zero samples.
                tagget.append([])
                continue
            X = [self._features(s, index) for index in range(len(s))]
            y = self._clf.predict(X)
            tagget.append(list(zip(s, y)))
        return tagget

    def print_mistakes(self):
        """Print every held-out token whose predicted tag differs from the gold tag.

        Each line reads ``<word> f:<predicted> t:<gold>``. The split index is
        computed from this instance rather than read from a notebook global.
        """
        X, y = self._held_out_dataset()
        if not X:
            return
        y_pred = self._clf.predict(X)
        for features, predicted, actual in zip(X, y_pred, y):
            if predicted != actual:
                print(features['word'], 'f:' + predicted, 't:' + actual)

In [16]:
# Parse the first 20 annotation files; `p` holds one list of (word, tag)
# tuples per sentence and is reused by the cells below.
par = DataParser(limit=20)
p = par.read_data()

In [17]:
# Sentences from index `limit` onwards form the held-out 25% of the corpus.
limit = int(len(p) * .75)

# Flatten the held-out sentences into a single list of (word, tag) tuples.
ts = [pair for held_out in p[limit:] for pair in held_out]
print(len(ts))
print(len(p[:limit]))

print(len(p[limit:]))
print(len(p))

# For every distinct tag keep the first word seen with it, as (tag, word).
tags = []
t = []
sentence = []
for tagged_sentence in p:
    for word, tag in tagged_sentence:
        if tag in tags:
            continue
        tags.append(tag)
        t.append((tag, word))
print(t)

8292
907
303
1210
[('PNCT', '«'), ('NOUN', 'Школа'), ('VERB', 'учит'), ('INFN', 'прикусить'), ('PRCL', 'ли'), ('PREP', 'в'), ('ADJF', 'новом'), ('NPRO', 'это'), ('ADVB', 'уже'), ('UNKN', 'ребрендинг'), ('PRED', 'можно'), ('CONJ', 'что'), ('COMP', 'дальше'), ('PRTF', 'появившихся'), ('NUMR', 'два'), ('INTJ', 'Однако'), ('PRTS', 'посвящен'), ('GRND', 'будучи'), ('ADJS', 'долго'), ('ROMN', 'XVII'), ('NUMB', '12'), ('LATN', 'deus'), ('SYMB', '+')]


In [18]:
def get_tagger(limit = 20):
    """Return a trained Tagger for the first ``limit`` files, cached on disk.

    The tagger is loaded from ``tagger.model<limit>.pkl`` in the current
    directory when that file can be unpickled. Otherwise a new one is trained
    and then written to that file.

    Args:
        limit: number of annotation files handed to ``DataParser``.

    Returns:
        The loaded or freshly trained tagger.
    """
    model_path = 'tagger.model{}.pkl'.format(limit)
    try:
        # SECURITY: pickle.load can execute arbitrary code stored in the file;
        # only load cache files that this notebook produced itself.
        with open(model_path, 'rb') as model_file:
            return pickle.load(model_file)
    except Exception:
        # Missing, truncated or incompatible cache: rebuild it below.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        pass

    # Train before opening the file for writing, so a failed training run
    # does not leave an empty cache file behind.
    tagger = Tagger(DataParser(limit))
    with open(model_path, 'wb') as model_file:
        pickle.dump(tagger, model_file, pickle.HIGHEST_PROTOCOL)
    return tagger

In [19]:
# Obtain the tagger for the first 20 files, then show its held-out
# classification report, its score and every misclassified token.
tagger = get_tagger(limit=20)
print(tagger.get_report())
print(tagger.get_score())
tagger.print_mistakes()

             precision    recall  f1-score   support

       ADJF       0.84      0.89      0.86      1021
       ADJS       0.70      0.41      0.52        34
       ADVB       0.83      0.80      0.82       198
       COMP       0.38      0.75      0.50         4
       CONJ       0.97      0.95      0.96       605
       GRND       0.60      0.19      0.29        16
       INFN       0.93      0.98      0.95       122
       INTJ       0.78      0.70      0.74        20
       LATN       0.67      0.74      0.71        94
       NOUN       0.88      0.93      0.90      2218
       NPRO       0.71      0.78      0.74       129
       NUMB       0.92      0.90      0.91        79
       NUMR       0.73      0.48      0.58        23
       PNCT       0.99      0.99      0.99      1959
       PRCL       0.95      0.92      0.93       195
       PRED       1.00      1.00      1.00         5
       PREP       0.91      0.97      0.94       715
       PRTF       0.64      0.59      0.62   

• f:PREP t:PNCT
Кодзасов f:NOUN t:UNKN
Кривнова f:NOUN t:UNKN
с f:ADJF t:NOUN
ISBN f:NOUN t:LATN
• f:PREP t:PNCT
учебник f:UNKN t:NOUN
студ f:NOUN t:UNKN
филол f:VERB t:UNKN
лингв f:NOUN t:UNKN
фак f:CONJ t:UNKN
высш f:NOUN t:UNKN
6-е f:NOUN t:ADJF
стер f:NOUN t:UNKN
факультет f:VERB t:NOUN
с f:ADJF t:NOUN
ISBN f:NOUN t:LATN
• f:PREP t:PNCT
Реформатский f:ADJF t:NOUN
Из f:PNCT t:PREP
Из f:PNCT t:PREP
• f:PREP t:PNCT
Изд-во f:UNKN t:NOUN
АН f:UNKN t:NOUN
В f:NOUN t:PREP
5 f:NOUN t:NUMB
IV f:LATN t:ROMN
• f:PREP t:PNCT
Избранные f:NOUN t:ADJF
Учпедгиз f:NOUN t:UNKN
Павичем f:CONJ t:UNKN
Павич f:NOUN t:UNKN
работал f:NOUN t:VERB
Париже f:CONJ t:NOUN
Фрайбурге f:NOUN t:UNKN
Владел f:NOUN t:VERB
Павич f:NOUN t:UNKN
1967-м f:NUMB t:UNKN
Я f:PREP t:NPRO
спустя f:NOUN t:PREP
Илиадой f:ADJF t:UNKN
главное f:ADJF t:CONJ
проще f:ADVB t:COMP
его f:ADJF t:NPRO
глав f:GRND t:NOUN
Павича f:NOUN t:UNKN
Ящик f:UNKN t:NOUN
Однако f:INTJ t:CONJ
помня f:NOUN t:GRND
многом f:NOUN t:ADJF
сюжет f:VERB t:NOUN