№1

In [1]:
!pip install pyconll



In [2]:
import nltk
from nltk.tokenize import word_tokenize
import matplotlib
%matplotlib inline

from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")

import corus
import pandas as pd
import numpy as np

import pyconll

In [3]:
import wget
url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-Taiga/master/ru_taiga-ud-train.conllu'
train = wget.download(url)
print(train)

ru_taiga-ud-train (1).conllu


In [4]:
import wget
url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-Taiga/master/ru_taiga-ud-dev.conllu'
test = wget.download(url)
print(test)

ru_taiga-ud-dev (1).conllu


In [5]:
# Общая обучающая выборка
full_train = pyconll.load_from_file('ru_taiga-ud-train.conllu')
full_test = pyconll.load_from_file('ru_taiga-ud-dev.conllu')

In [6]:
fdata_train = []
for sent in full_train[:]:
    fdata_train.append([(token.form, token.upos) for token in sent])
    
fdata_test = []
for sent in full_test[:]:
    fdata_test.append([(token.form, token.upos) for token in sent])
    
fdata_sent_test = []
for sent in full_test[:]:
    fdata_sent_test.append([token.form for token in sent])

In [7]:
comparing_list = []

In [8]:
%%time

default_tagger = DefaultTagger('NOUN')

# display(default_tagger.tag(fdata_sent_test[100]))
display(default_tagger.evaluate(fdata_test))

comparing_list.append(('default_tagger', default_tagger.evaluate(fdata_test)))

0.24167987321711568

CPU times: total: 15.6 ms
Wall time: 10 ms


In [9]:
%%time

unigram_tagger = UnigramTagger(fdata_train)

# display(unigram_tagger.tag(fdata_sent_test[100]))
display(unigram_tagger.evaluate(fdata_test))

comparing_list.append(('unigram_tagger', unigram_tagger.evaluate(fdata_test)))

0.6831418383518225

CPU times: total: 250 ms
Wall time: 240 ms


In [10]:
%%time

bigram_tagger = BigramTagger(fdata_train, backoff=unigram_tagger)

# display(bigram_tagger.tag(fdata_sent_test[100]))
display(bigram_tagger.evaluate(fdata_test))

comparing_list.append(('bigram_tagger', bigram_tagger.evaluate(fdata_test)))

0.6859152139461173

CPU times: total: 359 ms
Wall time: 361 ms


In [11]:
%%time

trigram_tagger = TrigramTagger(fdata_train, backoff=bigram_tagger)

# display(trigram_tagger.tag(fdata_sent_test[100]))
display(trigram_tagger.evaluate(fdata_test))

comparing_list.append(('trigram_tagger', trigram_tagger.evaluate(fdata_test)))

0.6867076069730587

CPU times: total: 484 ms
Wall time: 478 ms


In [12]:
from nltk.tag import TrigramTagger 

def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff

backoff = DefaultTagger('NOUN')

In [13]:
%%time

tag = backoff_tagger(fdata_train,  
                     [UnigramTagger, BigramTagger, TrigramTagger],  
                     backoff = backoff) 
  
display(tag.evaluate(fdata_test))

comparing_list.append(('U_B_T', tag.evaluate(fdata_test)))

0.785756735340729

CPU times: total: 1.19 s
Wall time: 1.21 s


In [14]:
%%time

tag = backoff_tagger(fdata_train,  
                     [UnigramTagger, BigramTagger],  
                     backoff = backoff) 
  
display(tag.evaluate(fdata_test))

comparing_list.append(('U_B', tag.evaluate(fdata_test)))

0.7847662440570523

CPU times: total: 609 ms
Wall time: 613 ms


In [15]:
%%time

tag = backoff_tagger(fdata_train,  
                     [BigramTagger, TrigramTagger],  
                     backoff = backoff) 
  
display(tag.evaluate(fdata_test))

comparing_list.append(('B_T', tag.evaluate(fdata_test)))

0.713648969889065

CPU times: total: 1 s
Wall time: 998 ms


In [16]:
%%time

tag = backoff_tagger(fdata_train,  
                     [UnigramTagger, TrigramTagger],  
                     backoff = backoff) 
  
display(tag.evaluate(fdata_test))

comparing_list.append(('U_T', tag.evaluate(fdata_test)))

0.7859548335974643

CPU times: total: 625 ms
Wall time: 621 ms


In [17]:
train_tok = []
train_label = []
for sent in fdata_train[:]:
    for tok in sent:
        if (tok[0] is None) or (tok[1] is None):
            continue
        train_tok.append(tok[0])
        train_label.append('NO_TAG' if tok[1] is None else tok[1])
        
test_tok = []
test_label = []
for sent in fdata_test[:]:
    for tok in sent:
        if (tok[0] is None) or (tok[1] is None):
            continue
        test_tok.append(tok[0])
        test_label.append('NO_TAG' if tok[1] is None else tok[1])

In [18]:
le = LabelEncoder()
train_enc_labels = le.fit_transform(train_label)

In [19]:
test_enc_labels = le.transform(test_label)

COUNT VECTORIZER

In [20]:
cvtr = CountVectorizer(ngram_range=(1, 3), analyzer='char')

X_train = cvtr.fit_transform(train_tok)
X_test = cvtr.transform(test_tok)

lr = LogisticRegression(random_state=0, n_jobs=8, max_iter=20)
lr.fit(X_train, train_enc_labels)

pred = lr.predict(X_test)
display(accuracy_score(test_enc_labels, pred))

comparing_list.append(('count_vectorizer', accuracy_score(test_enc_labels, pred)))

0.7794175911251982

HASHING VECTORIZER

In [21]:
hvtr = HashingVectorizer(ngram_range=(1, 3), analyzer='char', n_features=100)

X_train = hvtr.fit_transform(train_tok)
X_test = hvtr.transform(test_tok)

lr = LogisticRegression(random_state=0, n_jobs=8, max_iter=20)
lr.fit(X_train, train_enc_labels)

pred = lr.predict(X_test)
display(accuracy_score(test_enc_labels, pred))

comparing_list.append(('hashing_vectorizer', accuracy_score(test_enc_labels, pred)))

0.5879556259904913

TFIDF VECTORIZER

In [22]:
tfvtr = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')

X_train = tfvtr.fit_transform(train_tok)
X_test = tfvtr.transform(test_tok)

lr = LogisticRegression(random_state=0, n_jobs=8, max_iter=20)
lr.fit(X_train, train_enc_labels)

pred = lr.predict(X_test)
display(accuracy_score(test_enc_labels, pred))

comparing_list.append(('tfidf_vectorizer', accuracy_score(test_enc_labels, pred)))

0.7106774960380349

In [23]:
comparing_list

[('default_tagger', 0.24167987321711568),
 ('unigram_tagger', 0.6831418383518225),
 ('bigram_tagger', 0.6859152139461173),
 ('trigram_tagger', 0.6867076069730587),
 ('U_B_T', 0.785756735340729),
 ('U_B', 0.7847662440570523),
 ('B_T', 0.713648969889065),
 ('U_T', 0.7859548335974643),
 ('count_vectorizer', 0.7794175911251982),
 ('hashing_vectorizer', 0.5879556259904913),
 ('tfidf_vectorizer', 0.7106774960380349)]

In [24]:
pd.DataFrame(comparing_list, columns=['tagger', 'accuracy']).sort_values(by='accuracy', ascending=False)

Unnamed: 0,tagger,accuracy
7,U_T,0.785955
4,U_B_T,0.785757
5,U_B,0.784766
8,count_vectorizer,0.779418
6,B_T,0.713649
10,tfidf_vectorizer,0.710677
3,trigram_tagger,0.686708
2,bigram_tagger,0.685915
1,unigram_tagger,0.683142
9,hashing_vectorizer,0.587956
