# Введение в обработку естественного языка
## Урок 5. Part-of-Speech разметка, NER, извлечение отношений

Задание 1. Написать теггер на данных с русским языком
проверить UnigramTagger, BigramTagger, TrigramTagger и их комбинации
написать свой теггер как на занятии, попробовать разные векторайзеры, добавить знание не только букв, но и слов
сравнить все реализованные методы, сделать выводы  

In [None]:
!pip install pyconll

In [None]:
import nltk
nltk.download('tagsets')

from nltk.tokenize import word_tokenize
import matplotlib
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger, TrigramTagger
from nltk.tag import RegexpTagger

import pyconll

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the three training shards; shard "a" is the container that ends up
# holding the whole training set.
full_train = pyconll.load_from_file('dataset_ru/ru_syntagrus-ud-train-a.conllu')
full_train_b = pyconll.load_from_file('dataset_ru/ru_syntagrus-ud-train-b.conllu')
full_train_c = pyconll.load_from_file('dataset_ru/ru_syntagrus-ud-train-c.conllu')

# Combined training set: append the sentences of shards "b" and "c" to "a".
remaining_sentences = list(full_train_b) + list(full_train_c)
full_train.extend(remaining_sentences)

# The dev split is loaded as the test set.
full_test = pyconll.load_from_file('dataset_ru/ru_syntagrus-ud-dev.conllu')

In [None]:
# Show each token of the first test sentence next to its UPOS tag,
# followed by an empty line.
for sentence in full_test[:1]:
    for tok in sentence:
        print(tok.form, tok.upos)
    print()

In [None]:
# Sentences as lists of (form, upos) pairs -- the input format of the NLTK taggers.
fdata_train = [
    [(token.form, token.upos) for token in sent]
    for sent in full_train[:]
]

fdata_test = [
    [(token.form, token.upos) for token in sent]
    for sent in full_test[:]
]

# Test sentences as plain word lists (no tags), used for tagging demos.
fdata_sent_test = [
    [token.form for token in sent]
    for sent in full_test[:]
]

In [None]:
# Train a unigram tagger, then show the tags for test sentence 50 and the
# score on the whole test set.
unigram_tagger = UnigramTagger(fdata_train)
unigram_sample = unigram_tagger.tag(fdata_sent_test[50])
unigram_score = unigram_tagger.evaluate(fdata_test)
display(unigram_sample, unigram_score)

In [None]:
# Train a bigram tagger (no backoff), then show the tags for test sentence 50
# and the score on the whole test set.
bigram_tagger = BigramTagger(fdata_train)
bigram_sample = bigram_tagger.tag(fdata_sent_test[50])
bigram_score = bigram_tagger.evaluate(fdata_test)
display(bigram_sample, bigram_score)

In [None]:
# Train a trigram tagger (no backoff), then show the tags for test sentence 50
# and the score on the whole test set.
trigram_tagger = TrigramTagger(fdata_train)
trigram_sample = trigram_tagger.tag(fdata_sent_test[50])
trigram_score = trigram_tagger.evaluate(fdata_test)
display(trigram_sample, trigram_score)

In [None]:
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    """Chain taggers so that each one falls back to the previously built one.

    Every class in ``tagger_classes`` is instantiated in order as
    ``cls(train_sents, backoff=<previous tagger>)``; the first class receives
    the ``backoff`` argument as its fallback.

    Returns the last tagger built, or ``backoff`` itself when
    ``tagger_classes`` is empty.
    """
    chain = backoff
    for tagger_cls in tagger_classes:
        chain = tagger_cls(train_sents, backoff=chain)
    return chain


# The noun tag is used as the last-resort backoff option.
backoff = DefaultTagger('NOUN')

# Trigram -> bigram -> unigram -> default 'NOUN'.
tag = backoff_tagger(
    fdata_train,
    [UnigramTagger, BigramTagger, TrigramTagger],
    backoff=backoff,
)

tag.evaluate(fdata_test)

In [None]:
def _flatten_tagged(sents):
    """Flatten tagged sentences into parallel token and label lists.

    ``sents`` is an iterable of sentences, each an iterable of
    ``(form, upos)`` pairs. Pairs whose form or tag is ``None`` are dropped,
    so no placeholder label is ever needed.

    Returns a ``(tokens, labels)`` tuple of equally long lists.
    """
    tokens, labels = [], []
    for sent in sents:
        for form, upos in sent:
            if form is None or upos is None:
                continue
            tokens.append(form)
            labels.append(upos)
    return tokens, labels


# Token-level training and test data for the classifier-based tagger.
train_tok, train_label = _flatten_tagged(fdata_train)
test_tok, test_label = _flatten_tagged(fdata_test)

In [None]:
# Learn the tag -> integer mapping on the training labels only and apply the
# same mapping to both splits.
le = LabelEncoder()
le.fit(train_label)
train_enc_labels = le.transform(train_label)
test_enc_labels = le.transform(test_label)

In [None]:
# Display the tag set learned by the label encoder.
le.classes_

In [None]:
# Character n-gram features, hashed into a fixed-size space.
hvectorizer = HashingVectorizer(ngram_range=(2, 15), analyzer='char', n_features=65536)
# Word-level features. The default token_pattern only keeps tokens of two or
# more word characters, so one-letter words and punctuation would become
# all-zero rows; r'(?u)\S+' keeps every whitespace-delimited token instead.
tvectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='word', token_pattern=r'(?u)\S+')
# Character n-gram counts with an explicit vocabulary.
cvectorizer = CountVectorizer(ngram_range=(2, 13), analyzer='char')

In [None]:
# Each vectorizer is fitted on the training tokens and then applied unchanged
# to the test tokens.
Xh_train, Xh_test = hvectorizer.fit_transform(train_tok), hvectorizer.transform(test_tok)

Xt_train, Xt_test = tvectorizer.fit_transform(train_tok), tvectorizer.transform(test_tok)

Xc_train, Xc_test = cvectorizer.fit_transform(train_tok), cvectorizer.transform(test_tok)

In [None]:
%%time
lr = LogisticRegression(random_state=42, max_iter=500)
lr.fit(Xh_train, train_enc_labels)
pred = lr.predict(Xh_test)
print(f'Accuracy на основе HashingVectorizer - {accuracy_score(test_enc_labels, pred):.4f}.')

In [None]:
%%time
lr = LogisticRegression(random_state=42, max_iter=500)
lr.fit(Xt_train, train_enc_labels)
pred = lr.predict(Xt_test)
print(f'Accuracy на основе TfidfVectorizer - {accuracy_score(test_enc_labels, pred):.4f}.')

In [None]:
%%time
lr = LogisticRegression(random_state=42, max_iter=500)
lr.fit(Xc_train, train_enc_labels)
pred = lr.predict(Xc_test)
print(f'Accuracy на основе CountVectorizer - {accuracy_score(test_enc_labels, pred):.4f}.')

Как видим, наилучшая точность среди векторайзеров получилась для модели на основе CountVectorizer с символьными n-граммами.

Задание 2. Проверить, насколько хорошо работает NER
Данные брать из Index of /pub/named_entities
проверить NER из nltk/spacy/deeppavlov.
написать свой NER, попробовать разные подходы.
передаём в сетку токен и его соседей.
передаём в сетку только токен.
свой вариант.
сравнить свои реализованные подходы на качество — вывести precision/recall/f1_score.

In [None]:
!pip install natasha corus
!pip -q install spacy
!python -m spacy download ru_core_news_md

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk

nltk.download('averaged_perceptron_tagger_ru')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('tagsets')

from corus import load_ne5
from razdel import tokenize
from sklearn.metrics import classification_report

In [None]:
# Directory with the Collection5 corpus.
# NOTE(review): `dir` shadows the built-in `dir()`; the name is kept because
# later cells read it.
dir = 'Collection5/'
records = load_ne5(dir)
# Pull the first record from the reader so the notebook displays it.
next(records)

In [None]:
# Run the NLTK pipeline (tokenize -> POS-tag -> NE-chunk) on the first record
# only and print the chunks that carry an entity label.
records = load_ne5(dir)
rec = next(records, None)
if rec is not None:
    print(rec.text)
    print('\nИменованные сущности:')
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(rec.text), lang='rus')
    for chunk in nltk.ne_chunk(tagged_tokens):
        if hasattr(chunk, 'label'):
            print(f'{chunk} - {chunk.label()}')

In [None]:
# Re-create the reader. `records` is a one-shot iterator and the preview cell
# above has already pulled the first record from it, so iterating the old
# object would silently skip that record (and yield nothing on a re-run).
records = load_ne5(dir)

# Convert every record to [words, labels] with BIO-style labels:
# 'B-<type>' for the first token of an entity, 'I-<type>' for the following
# ones, 'OUT' for tokens outside any entity.
# NOTE(review): 'OUT' is used instead of the conventional 'O' outside tag --
# confirm that the downstream tools accept it.
docs = []
for rec in records:
    words = []
    labels = []
    # Entities are visited left to right, so order them by start offset.
    rec_entities = sorted(rec.spans, key=lambda v: v.start)
    len_ents = len(rec_entities)
    idx_ent = -1      # index of the entity currently being matched
    ent = None
    is_start = None   # True until the first token of `ent` has been labelled
    for token in tokenize(rec.text):
        type_ent = 'OUT'
        if len_ents:
            # Move to the next entity on the very first token, or once the
            # token starts past the end of the current entity.
            if (idx_ent == -1) or (idx_ent + 1 < len_ents and token.start > ent.stop):
                idx_ent += 1
                ent = rec_entities[idx_ent]
                is_start = True

            # A token belongs to the entity only if it lies fully inside it.
            if (token.start >= ent.start) and (token.stop <= ent.stop):
                type_ent = ('B-' if is_start else 'I-') + ent.type
                is_start = False
        words.append(token.text)
        labels.append(type_ent)

    docs.append([words, labels])

In [None]:
# Show the tokens and the labels of the first converted document.
print(docs[0][0])
print(docs[0][1])

In [None]:
# Share of documents (taken from the start of `docs`) that go to training.
training_coeff = 0.75
n_train = len(docs) * training_coeff

# Write one "token<TAB>label" line per token and a blank line after every
# document: all documents to c5.bio, the first `training_coeff` share to
# c5_train.bio and the rest to c5_valid.bio.
# The encoding is explicit so the Cyrillic text does not depend on the
# platform default.
with open('c5.bio', 'w', encoding='utf-8') as w, \
        open('c5_train.bio', 'w', encoding='utf-8') as w1, \
        open('c5_valid.bio', 'w', encoding='utf-8') as w2:
    for irec, rec in enumerate(docs):
        split_file = w1 if irec < n_train else w2
        block = ''.join('\t'.join(pair) + '\n' for pair in zip(*rec)) + '\n'
        w.write(block)
        split_file.write(block)

In [None]:
!python -m spacy init config base_config.cfg -F -p  ner -l ru
!python -m spacy init fill-config base_config.cfg config.cfg
# !python -m spacy convert c5.bio . -t json -c ner
!python -m spacy convert c5_train.bio . -c ner
!python -m spacy convert c5_valid.bio . -c ner

In [None]:
!python -m spacy train config.cfg --output ./output --paths.train c5_train.spacy --paths.dev c5_valid.spacy

In [None]:
!python -m spacy evaluate output/model-last c5_valid.spacy

In [None]:
!pip install deeppavlov
!python -m deeppavlov install squad_bert
!python -m deeppavlov install ner_ontonotes
!pip install transformers

In [None]:
import deeppavlov
from deeppavlov import configs, build_model

# Build the model described by the `ner_few_shot_ru` config; download=True is
# passed so the required files are fetched.
ner_model = build_model(configs.ner.ner_few_shot_ru, download=True)

In [None]:
from deeppavlov.core.commands.utils import parse_config
# Parse the config and print the data path of its dataset reader.
config_dict = parse_config(configs.ner.ner_few_shot_ru)
print(config_dict['dataset_reader']['data_path'])

In [None]:
# Write the data files for the few-shot model: one "token<TAB>label" line per
# token and a blank line after every document.
#   all.txt   - every document
#   train.txt - documents 0..39
#   valid.txt - documents 40..44
#   test.txt  - documents 45..49
# Fix: the split files now get the blank separator line after each document;
# previously the last token line was written a second time instead.
# The encoding is explicit so the Cyrillic text does not depend on the
# platform default.
with open('ner_few_shot_data/all.txt', 'w', encoding='utf-8') as w, \
        open('ner_few_shot_data/train.txt', 'w', encoding='utf-8') as w1, \
        open('ner_few_shot_data/valid.txt', 'w', encoding='utf-8') as w2, \
        open('ner_few_shot_data/test.txt', 'w', encoding='utf-8') as w3:
    for irec, rec in enumerate(docs):
        block = ''.join('\t'.join(pair) + '\n' for pair in zip(*rec)) + '\n'
        w.write(block)
        if irec < 40:
            w1.write(block)
        elif irec < 45:
            w2.write(block)
        elif irec < 50:
            w3.write(block)
     

In [None]:
!python -m deeppavlov train ner_few_shot_ru

In [None]:
from deeppavlov import train_model
# Train with the `ner_few_shot_ru` config and keep the returned model.
ner_model = train_model(configs.ner.ner_few_shot_ru)

In [None]:
# Gold labels of documents 45..49, first per document and then flattened
# into one list.
y_t = []
for doc_idx in range(45, 50):
    y_t.append(docs[doc_idx][1])

y_true = []
for doc_labels in y_t:
    y_true.extend(doc_labels)

In [None]:
# Distinct gold labels present in the evaluated documents.
np.unique(np.array(y_true))

In [None]:
# Run the model on the token list of each of documents 45..49 and keep the
# second element of its output.
y_p = [ner_model(docs[i][0])[1] for i in range(45, 50)]

# The per-document outputs are nested two levels deep; flatten both levels
# into one list of tags.
y_pred = [tag for per_doc in y_p for per_token in per_doc for tag in per_token]

In [None]:
# Distinct labels predicted by the model.
np.unique(np.array(y_pred))

In [None]:
y_true = np.array(y_true)
y_true[np.char.endswith(y_true, 'MEDIA')]='MEDIA'
y_true[np.char.endswith(y_true, 'GEOPOLIT')]='GEOPOLIT'
y_true[np.char.endswith(y_true, 'LOC')]='LOC'
y_true[np.char.endswith(y_true, 'PER')]='PER'
y_true[np.char.endswith(y_true, 'ORG')]='ORG'
y_true[np.char.endswith(y_true, 'OUT')]='OUT'
np.unique(y_true)

In [None]:
# Drop the B-/I- prefixes: every predicted label ending with an entity name is
# replaced by the bare name, so the report is computed per entity type.
y_pred = np.array(y_pred)
for entity_name in ('MEDIA', 'GEOPOLIT', 'LOC', 'PER', 'ORG', 'OUT'):
    y_pred[np.char.endswith(y_pred, entity_name)] = entity_name
np.unique(y_pred)

In [None]:
# Per-class precision / recall / F1 of the DeepPavlov predictions.
print(classification_report(y_true, y_pred))

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D, GRU, LSTM, Dropout, Input
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn import model_selection, preprocessing, linear_model

In [None]:
# Build a flat list of [token, entity type] pairs over the whole corpus.
# A token gets the type of the first span that fully contains it, or 'OUT'
# when no span does.
dir = 'Collection5/'
records = load_ne5(dir)
words_docs = []
for rec in records:
    for token in tokenize(rec.text):
        type_ent = next(
            (ent.type for ent in rec.spans
             if token.start >= ent.start and token.stop <= ent.stop),
            'OUT',
        )
        words_docs.append([token.text, type_ent])

In [None]:
# One row per token: the token text and its entity tag.
df_words = pd.DataFrame(words_docs, columns=['word', 'tag'])
# Number of tokens per tag.
df_words['tag'].value_counts()

In [None]:
# Show the first three rows of the token table.
df_words.head(3)

In [None]:
# Split the tokens into training and validation parts; the seed is fixed
# (the same value as the other models in this notebook) so runs are
# reproducible.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    df_words['word'], df_words['tag'], random_state=42)

# Label-encode the target variable. The encoder is fitted on the training
# labels only and then applied to the validation labels, so both splits share
# one tag -> integer mapping (refitting on the validation labels could yield
# a different mapping).
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

# (token, label) pipelines: batches of 16, cached, with prefetching.
train_data = (
    tf.data.Dataset.from_tensor_slices((train_x, train_y))
    .batch(16)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)
valid_data = (
    tf.data.Dataset.from_tensor_slices((valid_x, valid_y))
    .batch(16)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

In [None]:
def custom_standardization(input_data):
    """Return ``input_data`` unchanged (identity standardization)."""
    return input_data

vocab_size = 30000
seq_len = 10


def _make_vectorizer(ngrams=None):
    """Build an integer-output TextVectorization layer with the shared settings."""
    return TextVectorization(
        standardize=custom_standardization,
        max_tokens=vocab_size,
        output_mode='int',
        ngrams=ngrams,
        output_sequence_length=seq_len)


# Without neighbouring tokens.
vectorize_layer = _make_vectorizer()

# With neighbouring tokens.
# NOTE(review): the dataset elements are single tokens, so word n-grams
# presumably add nothing over the plain layer -- confirm.
vectorize_layer_n13 = _make_vectorizer(ngrams=(1, 3))

vectorize_layer_n4 = _make_vectorizer(ngrams=4)

# Only the plain layer is adapted here, on the training texts (labels dropped).
text_data = train_data.map(lambda text, label: text)
vectorize_layer.adapt(text_data)

In [None]:
embedding_dim = 128

# Token-only model: vectorize -> embed -> two 1D convolutions -> GRU -> dense
# head with a 6-way softmax output.
modeln = Sequential()
for layer in (
    vectorize_layer,
    Embedding(vocab_size, embedding_dim),
    Conv1D(embedding_dim, 3),
    Conv1D(embedding_dim, 2),
    GRU(350),
    Dense(200, activation='relu'),
    Dense(6, activation='softmax'),
):
    modeln.add(layer)

In [None]:
# Integer class labels -> sparse categorical cross-entropy.
modeln.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Train for three epochs, validating on the held-out tokens after each epoch.
modeln.fit(train_data, validation_data=valid_data, epochs=3)

In [None]:
# Same architecture as the token-only model, fed by the ngrams=(1, 3)
# vectorization layer.
vectorize_layer_n13.adapt(text_data)

modeln13 = Sequential()
for layer in (
    vectorize_layer_n13,
    Embedding(vocab_size, embedding_dim),
    Conv1D(embedding_dim, 3),
    Conv1D(embedding_dim, 2),
    GRU(350),
    Dense(200, activation='relu'),
    Dense(6, activation='softmax'),
):
    modeln13.add(layer)

modeln13.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

modeln13.fit(train_data, validation_data=valid_data, epochs=3)

In [None]:
# Same architecture as the token-only model, fed by the ngrams=4
# vectorization layer.
vectorize_layer_n4.adapt(text_data)

modeln4 = Sequential()
for layer in (
    vectorize_layer_n4,
    Embedding(vocab_size, embedding_dim),
    Conv1D(embedding_dim, 3),
    Conv1D(embedding_dim, 2),
    GRU(350),
    Dense(200, activation='relu'),
    Dense(6, activation='softmax'),
):
    modeln4.add(layer)

modeln4.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

modeln4.fit(train_data, validation_data=valid_data, epochs=3)

In [None]:
# Class probabilities of the three models on the validation set, computed in
# the order modeln, modeln13, modeln4.
labels_predict_n, labels_predict_n13, labels_predict_n4 = (
    net.predict(valid_data) for net in (modeln, modeln13, modeln4)
)

In [None]:
# The models already end in a softmax layer, so their outputs are
# probabilities. Applying softmax a second time was redundant: it is monotonic
# and never changes the argmax. Take the most probable class directly.
class_preds_n = np.argmax(labels_predict_n, axis=1)
class_preds_n13 = np.argmax(labels_predict_n13, axis=1)
class_preds_n4 = np.argmax(labels_predict_n4, axis=1)

In [None]:
# Map integer class ids back to tag names for the reports.
# Note: this overwrites valid_y and the class_preds_* arrays in place.
valid_y = encoder.inverse_transform(valid_y)
class_preds_n, class_preds_n13, class_preds_n4 = (
    encoder.inverse_transform(class_ids)
    for class_ids in (class_preds_n, class_preds_n13, class_preds_n4)
)

In [None]:
# Precision / recall / F1 of the model built on the plain vectorization layer.
print(classification_report(valid_y, class_preds_n))

In [None]:
# Precision / recall / F1 of the model built on the ngrams=(1, 3) layer.
print(classification_report(valid_y, class_preds_n13))

In [None]:
# Precision / recall / F1 of the model built on the ngrams=4 layer.
print(classification_report(valid_y, class_preds_n4))

Модель, построенная с помощью библиотеки spaCy, показала наилучшие результаты.