In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger, TrigramTagger

import pyconll

import warnings 
# Silence DeprecationWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Parse the two CoNLL-U files (paths are relative to the working directory).
# NOTE: `full_test` is read from the *dev* file, so every "test" score in
# this notebook is measured on the dev split.
TRAIN_CONLLU = 'ru_syntagrus-ud-train.conllu'
DEV_CONLLU = 'ru_syntagrus-ud-dev.conllu'

full_train = pyconll.load_from_file(TRAIN_CONLLU)
full_test = pyconll.load_from_file(DEV_CONLLU)

In [3]:
# Preview: print "form UPOS" for every token of the first two training
# sentences, followed by a blank line after each sentence.
for sentence in full_train[:2]:
    for tok in sentence:
        print(tok.form, tok.upos)
    print()

Снова ADV
приобрел VERB
дозу NOUN
, PUNCT

В ADP
женщине NOUN
важна ADJ
верность NOUN
, PUNCT
а CCONJ
не PART
красота NOUN
. PUNCT



In [4]:
# Tagged sentences as lists of (form, upos) pairs; these are later passed
# to the NLTK taggers for training and evaluation.
fdata_train = [[(tok.form, tok.upos) for tok in sentence] for sentence in full_train]
fdata_test = [[(tok.form, tok.upos) for tok in sentence] for sentence in full_test]

# Untagged variant of the evaluation sentences: word forms only.
fdata_sent_test = [[tok.form for tok in sentence] for sentence in full_test]

In [5]:
# Corpus statistics over the training split: the longest sentence and the
# longest token form (by character count).
sentence_lengths = [len(sent) for sent in full_train]
token_lengths = [len(token.form) for sent in full_train for token in sent]

MAX_SENT_LEN = max(sentence_lengths)
MAX_ORIG_TOKEN_LEN = max(token_lengths)

print('Наибольшая длина предложения', MAX_SENT_LEN)
print('Наибольшая длина токена', MAX_ORIG_TOKEN_LEN)

Наибольшая длина предложения 274
Наибольшая длина токена 161


In [6]:
# One string per sentence: the token forms joined by single spaces.
all_train_texts = [' '.join(token.form for token in sent) for sent in full_train]
all_test_texts = [' '.join(token.form for token in sent) for sent in full_test]

# One string per sentence: the UPOS tags joined by single spaces, aligned
# token-by-token with the texts above.
# Fix: these lists previously joined `token.form`, which made the "labels"
# exact copies of the texts. A missing tag is written as 'NO_TAG', the same
# placeholder the per-token label lists in this notebook use.
all_train_labels = [
    ' '.join('NO_TAG' if token.upos is None else token.upos for token in sent)
    for sent in full_train
]
all_test_labels = [
    ' '.join('NO_TAG' if token.upos is None else token.upos for token in sent)
    for sent in full_test
]

# Show the first ten training sentences as plain text.
print('\n'.join(all_train_texts[:10]))

Снова приобрел дозу ,
В женщине важна верность , а не красота .
Важно то , о чем ты думаешь и какие поступки совершаешь .
Уже не та на лоб спадает челка ...
Но ты не живешь по-евангельски , и это — причина твоих проблем .
Ведь этот цветок цветёт для меня !
И как же больно было нам ,
Как свет добра струился с глаз !
Их отличает харизма , приятная внешность , живой аналитический ум , хорошее воспитание и манеры , за которыми всегда спрятан сильный взгляд , в котором едва просматривается небольшая грусть .
Увы , не понимаю ... Где эта жизнь , которая моя .


In [7]:
# Bigram tagger that backs off to a unigram tagger; both are trained on the
# same tagged training sentences.
# NOTE: the backoff chain ends at the unigram tagger (no default tagger),
# so a word neither model can tag is returned with tag None.
unigram_backoff = UnigramTagger(fdata_train)
bigram_tagger = BigramTagger(fdata_train, backoff=unigram_backoff)

In [8]:
# Show (1) the tags predicted for one evaluation sentence and (2) the
# tagger's score on the tagged evaluation sentences.
# NOTE(review): `evaluate` appears to be deprecated in recent NLTK releases in
# favour of `accuracy` — confirm against the installed version before switching.
display(bigram_tagger.tag(fdata_sent_test[100]), bigram_tagger.evaluate(fdata_test))

[('Не', 'PART'), ('могу', 'VERB'), ('найти', 'VERB'), ('воробья', None)]

0.6859152139461173

In [9]:
# Flatten the tagged sentences into parallel per-token lists (forms and
# tags). A missing (None) tag is replaced by the 'NO_TAG' placeholder so
# that every label is a string.
train_tok = [form for sentence in fdata_train for form, _tag in sentence]
train_label = [
    'NO_TAG' if tag is None else tag
    for sentence in fdata_train
    for _form, tag in sentence
]

test_tok = [form for sentence in fdata_test for form, _tag in sentence]
test_label = [
    'NO_TAG' if tag is None else tag
    for sentence in fdata_test
    for _form, tag in sentence
]

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [11]:
# Learn the tag -> integer mapping from the training labels only, then
# encode the training labels with it.
le = LabelEncoder().fit(train_label)
train_enc_labels = le.transform(train_label)

In [12]:
# Encode the evaluation labels with the encoder fitted on the training labels.
test_enc_labels = le.transform(test_label)

In [13]:
# Inspect the tag inventory the encoder learned from the training labels.
le.classes_

array(['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
       'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'],
      dtype='<U5')

In [24]:
# Experiment 1: character n-grams of length 1-3, hashed into 300 feature columns.
hvectorizer = HashingVectorizer(ngram_range=(1, 3), analyzer='char', n_features=300)

In [25]:
# Vectorise every training token (each token is treated as one document).
X_train = hvectorizer.fit_transform(train_tok)

In [26]:
# Vectorise the evaluation tokens with the same vectorizer.
X_test = hvectorizer.transform(test_tok)

In [27]:
# Sanity check: (number of training tokens, number of hashed features).
X_train.shape

(176631, 300)

In [28]:
# Logistic regression over the hashed character n-gram features;
# random_state is fixed and the iteration cap is raised to 1000.
lr = LogisticRegression(random_state=0, max_iter=1000)
lr.fit(X_train, train_enc_labels)

LogisticRegression(max_iter=1000, random_state=0)

In [29]:
# Predict encoded tag ids for the evaluation tokens.
pred = lr.predict(X_test)

In [30]:
# Token-level accuracy of the predictions against the encoded evaluation labels.
accuracy_score(test_enc_labels, pred)

0.731378763866878

In [31]:
# Experiment 2: widen the character n-gram range to 1-5 while keeping 300 hashed columns.
hvectorizer = HashingVectorizer(ngram_range=(1, 5), analyzer='char', n_features=300)

In [32]:
# Re-vectorise both splits with the new n-gram range and check the training
# matrix shape.
X_train = hvectorizer.fit_transform(train_tok)
X_test = hvectorizer.transform(test_tok)
X_train.shape

(176631, 300)

In [33]:
# Fresh logistic regression (same settings as before) fitted on the new features.
lr = LogisticRegression(random_state=0, max_iter=1000)
lr.fit(X_train, train_enc_labels)

LogisticRegression(max_iter=1000, random_state=0)

In [34]:
# Predict encoded tag ids for the evaluation tokens with the refitted model.
pred = lr.predict(X_test)

In [35]:
# Token-level accuracy for the 1-5 character n-gram hashing experiment.
accuracy_score(test_enc_labels, pred)

0.7138470681458003

In [37]:
# Experiment 3: replace hashing with an explicit vocabulary of character
# n-grams (length 1-5) counted per token.
# NOTE: the variable keeps the name `hvectorizer` although it now holds a
# CountVectorizer.
hvectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5))
X_train = hvectorizer.fit_transform(train_tok)
X_test = hvectorizer.transform(test_tok)

# Refit the existing `lr` estimator (created in an earlier cell) on the new
# features, then score its predictions on the evaluation tokens.
pred = lr.fit(X_train, train_enc_labels).predict(X_test)
accuracy_score(test_enc_labels, pred)

0.8820324881141046

In [39]:
from xgboost import XGBClassifier

  from pandas import MultiIndex, Int64Index


In [40]:
# Experiment 4: character n-gram counts (length 1-5) fed to an XGBoost
# classifier instead of logistic regression.
hvectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5))
X_train = hvectorizer.fit_transform(train_tok)
X_test = hvectorizer.transform(test_tok)

xgb_cl = XGBClassifier(
    n_estimators=150,
    max_depth=28,
    learning_rate=0.1,
    random_state=0,
)

# Fit on the training tokens and score the predictions on the evaluation tokens.
pred = xgb_cl.fit(X_train, train_enc_labels).predict(X_test)
accuracy_score(test_enc_labels, pred)





0.8974841521394612