<a href="https://colab.research.google.com/github/Vakhranev/Compling/blob/master/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import nltk
# Fetch the NLTK 'treebank' corpus package that the next cell reads tagged sentences from.
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [0]:
# Each sentence is a sequence of (word, POS tag) pairs (see the sample displayed below).
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [39]:
# Peek at the first tagged sentence to check the (word, tag) layout.
tagged_sentences[0]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [0]:
# Separate every tagged sentence into two parallel tuples: its words and its tags.
sentences = []
sentence_tags = []
for tagged_sentence in tagged_sentences:
    words, pos_tags = zip(*tagged_sentence)  # [(w, t), ...] -> (w, ...), (t, ...)
    sentences.append(words)
    sentence_tags.append(pos_tags)

In [0]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the sentences for evaluation. The seed is fixed so that the
# split (and everything derived from it: vocabularies, ids, metrics) is the
# same on every Restart & Run All.
sent_train, sent_test, tag_train, tag_test = train_test_split(
    sentences, sentence_tags, test_size=0.2, random_state=42
)

In [0]:
from collections import Counter
# `Iterable` lives in collections.abc; the alias in `collections` was removed
# in Python 3.10, so importing it from there raises ImportError.
from collections.abc import Iterable

# Count lower-cased word frequencies over the training sentences only.
vocab = Counter()
for sent in sent_train:
    vocab.update(word.lower() for word in sent)

In [0]:
# Keep only words seen more than 5 times in the training data.
filtered_vocab = {word for word in vocab if vocab[word] > 5}

In [0]:
# Id 0 is reserved for padding and id 1 for out-of-vocabulary words.
word2id = {'PAD': 0, 'UNK': 1}
# Iterate in sorted order: a set of strings has no stable iteration order
# between interpreter runs, which would give every run different word ids.
for i, word in enumerate(sorted(filtered_vocab), start=2):
    word2id[word] = i

# Reverse lookup: id -> word.
id2word = {i: word for word, i in word2id.items()}

In [0]:
# Tag ids are handed out in order of first appearance in the training tags;
# id 0 is reserved for padding.
tag2id = {'PAD': 0}
for tags in tag_train:
    for tag in tags:
        # setdefault only inserts when the lower-cased tag is new, so the
        # next free id (the current size of the mapping) is used exactly once.
        tag2id.setdefault(tag.lower(), len(tag2id))

# Reverse lookup: id -> tag.
id2tag = dict((idx, name) for name, idx in tag2id.items())

In [0]:
# Imported here because this cell runs before the later cell that imports it.
from itertools import chain

# Id 0 is reserved for padding and id 1 for unknown characters.
char2id = {'PAD': 0, 'UNK': 1}
# Flatten sentences -> words -> characters and collect the lower-cased set.
chars = {ch.lower() for ch in chain.from_iterable(chain.from_iterable(sent_train))}
# Sorted iteration keeps the character ids identical across runs (set order is not stable).
for ch in sorted(chars):
    if ch not in char2id:
        char2id[ch] = len(char2id)
# Reverse lookup: id -> character.
id2char = {i: ch for ch, i in char2id.items()}

In [0]:
def data2ints(data, smth2id):
  """Map every item of every sequence in `data` to its integer id.

  Items are lower-cased before the lookup in `smth2id`. An item that is not in
  the mapping is replaced by the id stored under 'UNK'; that key is only
  looked up when it is needed, so a mapping without 'UNK' works as long as
  every item is known (and raises KeyError otherwise).

  Returns a list of lists of ids with the same nesting as `data`.
  """
  def encode(item):
    key = item.lower()
    return smth2id[key] if key in smth2id else smth2id['UNK']

  return [[encode(item) for item in seq] for seq in data]

In [0]:
# Words -> word ids (unknown words fall back to 'UNK').
X_train_ids = data2ints(sent_train, word2id)
X_test_ids = data2ints(sent_test, word2id)
# Tags -> tag ids.
y_train_ids = data2ints(tag_train, tag2id)
y_test_ids = data2ints(tag_test, tag2id)

In [0]:
# Sequence length fed to the model: the longest training sentence, capped at 120 tokens.
MAX_LEN = min(max(map(len, sent_train)), 120)

In [0]:
from itertools import chain
# Characters kept per word: the longest training word, capped at 16.
max_char_len = min(max(len(word) for sent in sent_train for word in sent), 16)

In [0]:
def _sentences_to_char_ids(sents):
    """Encode every word of every sentence as a list of character ids (1 for unknown characters)."""
    return [
        [[char2id.get(ch.lower(), 1) for ch in word] for word in sent]
        for sent in sents
    ]

X_char_train_ids = _sentences_to_char_ids(sent_train)
X_char_test_ids = _sentences_to_char_ids(sent_test)

In [0]:
# Imported here because this cell runs before the later cell that imports
# pad_sequences; without it a fresh kernel fails with NameError.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad in two steps: first every word to max_char_len characters, then every
# sentence to MAX_LEN words, giving one (MAX_LEN, max_char_len) grid per sentence.
X_char_train = pad_sequences(
    [pad_sequences(ids, maxlen=max_char_len, padding='post') for ids in X_char_train_ids],
    maxlen=MAX_LEN, padding='post')
X_char_test = pad_sequences(
    [pad_sequences(ids, maxlen=max_char_len, padding='post') for ids in X_char_test_ids],
    maxlen=MAX_LEN, padding='post')

In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Right-pad the word-id sequences to MAX_LEN.
X_train = pad_sequences(X_train_ids, maxlen=MAX_LEN, padding='post')
X_test = pad_sequences(X_test_ids, maxlen=MAX_LEN, padding='post')
# Same padding for the tag-id sequences so they stay aligned with the words.
y_train_pad = pad_sequences(y_train_ids, maxlen=MAX_LEN, padding='post')
y_test_pad = pad_sequences(y_test_ids, maxlen=MAX_LEN, padding='post')

In [0]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the padded tag ids; one class per entry of tag2id (padding included).
y_train = to_categorical(y_train_pad, num_classes=len(tag2id))
y_test = to_categorical(y_test_pad, num_classes=len(tag2id))

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM, GRU, Bidirectional, TimeDistributed, InputLayer, Embedding, Conv1D, Input, Flatten, concatenate
from tensorflow.keras.models import Model

# Word branch: 60-dimensional embeddings (id 0 is masked as padding) encoded by
# a bidirectional LSTM that returns one vector per token.
word_in = Input(shape=(MAX_LEN,))
emb_word = Embedding(len(word2id), 60, mask_zero=True)(word_in)
word_enc = Bidirectional(LSTM(256, return_sequences=True))(emb_word)

# Character branch: for every token, embed its characters, run a width-3
# convolution over them and flatten the result into one feature vector per token.
char_in = Input(shape=(MAX_LEN, max_char_len))
emb_char = TimeDistributed(Embedding(len(char2id), 8, input_length=max_char_len, mask_zero=False))(char_in)
conv_char = TimeDistributed(Conv1D(128, 3, 1, padding='same'))(emb_char)
flat_char = TimeDistributed(Flatten())(conv_char)

# Join both per-token representations, run a second BiLSTM over the sentence
# and predict a distribution over the tags for every position.
x = concatenate([word_enc, flat_char])
main_lstm = Bidirectional(LSTM(128, return_sequences=True))(x)
out = TimeDistributed(Dense(len(tag2id), activation='softmax'))(main_lstm)
model = Model([word_in, char_in], out)
model.compile(loss='categorical_crossentropy', optimizer='Adam',
              metrics=['accuracy'])

In [78]:
# Train on (word ids, char ids) -> one-hot tags, validating on the held-out split.
model.fit(
    [X_train, X_char_train],
    y_train,
    validation_data=([X_test, X_char_test], y_test),
    batch_size=128,
    epochs=20,
    shuffle=True,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f0797fc24a8>

In [0]:
from nltk.tokenize import word_tokenize

In [0]:
def tagger(sent):
  """Predict a POS tag for every token of a sentence with the trained model.

  Args:
    sent: either a raw string (tokenized with `word_tokenize`) or an iterable
      of already tokenized words.

  Returns:
    A list of (token, tag) pairs, one per input token; an empty list for an
    empty sentence.

  Raises:
    TypeError: if `sent` is neither a string nor an iterable.

  Sentences longer than MAX_LEN are tagged in consecutive chunks of at most
  MAX_LEN tokens, so every token gets the tag predicted for it. Letting
  `pad_sequences` truncate instead would drop tokens from the front while the
  predictions were paired with the first tokens, misaligning the result. Each
  chunk is tagged independently of the others.
  """
  # Local imports keep the function usable regardless of cell order;
  # `collections.Iterable` no longer exists on Python 3.10+.
  from collections.abc import Iterable
  import numpy as np

  if isinstance(sent, str):
    sent = word_tokenize(sent)
  if not isinstance(sent, Iterable):
    raise TypeError('Expected a string or an iterable of tokens')

  tokens = list(sent)  # materialize so generators can be measured and sliced
  if not tokens:
    return []

  chunks = [tokens[start:start + MAX_LEN] for start in range(0, len(tokens), MAX_LEN)]

  # Same encoding as the training data: lower-cased lookups, 1 for unknowns,
  # words padded to max_char_len characters and chunks padded to MAX_LEN words.
  char_s = [[[char2id.get(char.lower(), 1) for char in word] for word in chunk] for chunk in chunks]
  char_s = pad_sequences([pad_sequences(ids, maxlen=max_char_len, padding='post') for ids in char_s], maxlen=MAX_LEN, padding='post')
  word_s = [[word2id.get(word.lower(), 1) for word in chunk] for chunk in chunks]
  word_s = pad_sequences(word_s, maxlen=MAX_LEN, padding='post')

  tag_ids = np.argmax(model.predict([word_s, char_s]), axis=2)

  prediction = []
  for chunk, row in zip(chunks, tag_ids):
    # Drop the predictions made for padding positions.
    prediction.extend(id2tag[tag] for tag in row[:len(chunk)])
  return list(zip(tokens, prediction))

In [87]:
import numpy as np
# Tag a random held-out sentence. The upper bound of np.random.randint is
# exclusive, so len(sent_test) covers every valid index, and the list being
# indexed is the same one the bound is computed from.
tagger(sent_test[np.random.randint(0, len(sent_test))])

[('The', 'dt'),
 ('Labor', 'nnp'),
 ('Department', 'nnp'),
 ('cited', 'nnp'),
 ('USX', 'nnp'),
 ('Corp.', 'nnp'),
 ('for', 'in'),
 ('numerous', 'jj'),
 ('health', 'nn'),
 ('and', 'cc'),
 ('safety', 'nn'),
 ('violations', 'nns'),
 ('at', 'in'),
 ('two', 'cd'),
 ('Pennsylvania', 'jj'),
 ('plants', 'nns'),
 (',', ','),
 ('and', 'cc'),
 ('proposed', 'vbd'),
 ('$', '$'),
 ('7.3', 'cd'),
 ('million', 'cd'),
 ('*U*', '-none-'),
 ('in', 'in'),
 ('fines', 'nns'),
 (',', ','),
 ('the', 'dt'),
 ('largest', 'jjs'),
 ('penalty', 'nn'),
 ('ever', 'rb'),
 ('proposed', 'vbn'),
 ('*', '-none-'),
 ('for', 'in'),
 ('alleged', 'vbn'),
 ('workplace', 'nn'),
 ('violations', 'nns'),
 ('by', 'in'),
 ('an', 'dt'),
 ('employer', 'nn'),
 ('.', '.')]