In [40]:
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.corpus import conll2000
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
from keras.utils.np_utils import to_categorical


In [41]:
import nltk

# Make sure the three tagged corpora read by this notebook are available locally.
for corpus_name in ('treebank', 'brown', 'conll2000'):
    nltk.download(corpus_name)

# Presumably required for the tagset='universal' mapping requested when the corpora
# are loaded. Kept as the last expression so the cell still displays its result.
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to
[nltk_data]     /home/vishisht/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package brown to /home/vishisht/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /home/vishisht/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/vishisht/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [42]:
# Materialise each corpus as a list of sentences, where a sentence is a sequence of
# (word, tag) pairs. tagset='universal' requests the shared "universal" tag names so
# the three corpora use the same label vocabulary.
treebank_corpus, brown_corpus, conll_corpus = (
    list(reader.tagged_sents(tagset='universal'))
    for reader in (treebank, brown, conll2000)
)

In [43]:
# For info about the tags https://universaldependencies.org/u/pos/
# Distinct tags used by each of the three corpora.


def _unique_tags(corpus):
    """Return the set of distinct tags found in ``corpus``.

    ``corpus`` is an iterable of sentences; every sentence is an iterable of
    pairs whose second element is the tag. The tags are collected straight into
    a set: sorting the complete per-token tag list first is wasted work because
    a set does not keep any order.
    """
    return {pair[1] for sentence in corpus for pair in sentence}


# For treebank
tree_bank_tags = _unique_tags(treebank_corpus)
print("Treebank tags are", tree_bank_tags)

# For brown
brown_tags = _unique_tags(brown_corpus)
print("brown tags are", brown_tags)

# For conll2000
conll_tags = _unique_tags(conll_corpus)
print("conll tags are", conll_tags)

Treebank tags are {'ADV', 'ADJ', 'NUM', 'ADP', 'NOUN', '.', 'DET', 'CONJ', 'X', 'VERB', 'PRON', 'PRT'}
brown tags are {'ADV', 'ADJ', 'NUM', 'ADP', 'NOUN', '.', 'DET', 'CONJ', 'X', 'VERB', 'PRON', 'PRT'}
conll tags are {'ADV', 'ADJ', 'NUM', 'ADP', 'NOUN', '.', 'DET', 'CONJ', 'X', 'VERB', 'PRON', 'PRT'}


In [82]:
# The full corpus: the sentences of all three sources concatenated in a fixed
# order (treebank, then brown, then conll2000).
final_dataset = [*treebank_corpus, *brown_corpus, *conll_corpus]
len(final_dataset)

72202

In [83]:
# Replace the '.' tag with 'PUNCT'.
# New sentence lists are built instead of editing the existing ones in place:
# final_dataset shares its sentence objects with treebank_corpus, brown_corpus and
# conll_corpus, so in-place edits would silently change those corpora too and make
# earlier cells give different results when re-run.
final_dataset = [
    [(word, 'PUNCT' if tag == '.' else tag) for word, tag in sentence]
    for sentence in final_dataset
]

        

In [84]:
# Train / hold-out split: 80% of the sentences are used for training, the remaining
# 20% are divided into test and validation sets in the next step.
# random_state pins the shuffle so that a "Restart & Run All" reproduces the same
# split (and therefore the same saved arrays); 42 is an arbitrary fixed seed.
train_dataset, rem_dataset = train_test_split(final_dataset, train_size=0.8, random_state=42)

In [85]:
# Halve the hold-out sentences into a test set and a validation set.
# random_state makes the shuffle reproducible across kernel restarts; 42 is an
# arbitrary fixed seed.
test_dataset, val_dataset = train_test_split(rem_dataset, train_size=0.5, random_state=42)

In [86]:
# Number of sentences in each of the three splits.
train_len, test_len, val_len = map(len, (train_dataset, test_dataset, val_dataset))

In [87]:
# Show the sentence counts of the train, test and validation splits (in that order).
print(train_len, test_len, val_len)

57761 7220 7221


In [88]:
# Extracting all the tags that occur in the combined dataset.
# Duplicates are removed with a set first, so only the few distinct tags get sorted
# rather than the complete per-token tag list.
dataset_tags = sorted({pair[1] for sentence in final_dataset for pair in sentence})
print("Final dataset tags are", dataset_tags)

Final dataset tags are ['ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'PUNCT', 'VERB', 'X']


In [89]:
# Lookup tables between tag names and integer ids, used when the labels are encoded.
# Id 0 is reserved for the '-PAD-' pseudo-tag; the real tags receive ids 1, 2, ...
# in the order in which they appear in dataset_tags.
# TODO: document why '-PAD-' is needed (open question left by the original author).
tag_to_index = {'-PAD-': 0}
tag_to_index.update(
    (tag_name, position) for position, tag_name in enumerate(dataset_tags, start=1)
)
index_to_tag = {0: '-PAD-', **dict(enumerate(dataset_tags, start=1))}
print(tag_to_index)
print()
print(index_to_tag)
n_tags = len(tag_to_index)

{'-PAD-': 0, 'ADJ': 1, 'ADP': 2, 'ADV': 3, 'CONJ': 4, 'DET': 5, 'NOUN': 6, 'NUM': 7, 'PRON': 8, 'PRT': 9, 'PUNCT': 10, 'VERB': 11, 'X': 12}

{0: '-PAD-', 1: 'ADJ', 2: 'ADP', 3: 'ADV', 4: 'CONJ', 5: 'DET', 6: 'NOUN', 7: 'NUM', 8: 'PRON', 9: 'PRT', 10: 'PUNCT', 11: 'VERB', 12: 'X'}


In [90]:
# Some statistics: sentence lengths (number of (word, tag) pairs) in the training split.
sentence_lengths = [len(sentence) for sentence in train_dataset]
max_sen_len = max(sentence_lengths, default=0)
# avg_sen_len holds the actual mean (not a running total); an empty split yields 0.0
# instead of raising ZeroDivisionError.
avg_sen_len = sum(sentence_lengths) / len(sentence_lengths) if sentence_lengths else 0.0
print("Maximum Sentence Length = ", max_sen_len)
print("Average Sentence Length = ", avg_sen_len)

Maximum Sentence Length =  271
Average Sentence Length =  21.065442080296393


In [91]:
# Upper bound on the number of (word, tag) pairs per sentence chunk: it is the chunk
# size passed to split(), and MAX_SEQUENCE_LENGTH + 2 is the width used for the
# padded label rows and the truncated BERT inputs.
MAX_SEQUENCE_LENGTH = 70

In [92]:
def split(sentences, max_len):
    """Cut every sentence into consecutive chunks of at most ``max_len`` items.

    Args:
        sentences: iterable of sliceable sequences (e.g. lists of (word, tag) pairs).
        max_len: maximum chunk length; used as the step of ``range`` and therefore
            has to be non-zero.

    Returns:
        A flat list with the chunks of all sentences, in their original order.
        A sentence no longer than ``max_len`` contributes a single chunk; an empty
        sentence contributes nothing.
    """
    chunks = []
    for sentence in sentences:
        for start in range(0, len(sentence), max_len):
            chunks.append(sentence[start:start + max_len])
    return chunks

In [93]:
# Cut over-long sentences in every split into chunks of at most
# MAX_SEQUENCE_LENGTH (word, tag) pairs; the names are rebound to the chunked lists.
train_dataset, test_dataset, val_dataset = [
    split(subset, MAX_SEQUENCE_LENGTH)
    for subset in (train_dataset, test_dataset, val_dataset)
]

In [94]:
# After reducing sentence size: the same statistics on the chunked training split.
sentence_lengths = [len(sentence) for sentence in train_dataset]
max_sen_len = max(sentence_lengths, default=0)
# avg_sen_len holds the actual mean (not a running total); an empty split yields 0.0
# instead of raising ZeroDivisionError.
avg_sen_len = sum(sentence_lengths) / len(sentence_lengths) if sentence_lengths else 0.0
# Chunking must have capped every sentence at the configured maximum.
assert max_sen_len <= MAX_SEQUENCE_LENGTH, "split() left a chunk longer than MAX_SEQUENCE_LENGTH"
print("Maximum Sentence Length = ", max_sen_len)
print("Average Sentence Length = ", avg_sen_len)

Maximum Sentence Length =  70
Average Sentence Length =  20.97899963792479


In [95]:
def get_tags(sentences):
    """Return, for every sentence, the list of its tags.

    Each sentence is an iterable of two-item (word, tag) pairs; only the second
    item of every pair is kept. Sentence order and pair order are preserved.
    """
    tag_rows = []
    for sentence in sentences:
        tag_rows.append([tag for _word, tag in sentence])
    return tag_rows

def get_words(sentences):
    """Return, for every sentence, the list of its words.

    Each sentence is an iterable of two-item (word, tag) pairs; only the first
    item of every pair is kept. Sentence order and pair order are preserved.
    """
    word_rows = []
    for sentence in sentences:
        word_rows.append([word for word, _tag in sentence])
    return word_rows

In [96]:
# Separate every split into parallel structures: the words of each chunk (model
# input) and the tags of each chunk (labels).
train_text, train_tag = get_words(train_dataset), get_tags(train_dataset)
test_text, test_tag = get_words(test_dataset), get_tags(test_dataset)

In [116]:
# Show the words and the tags of the first training chunk to check that they line up
# one-to-one.
print(train_text[0])
print(train_tag[0])

['These', 'my', 'grandmother', 'left', 'in', 'their', 'places', '(', 'they', 'are', 'still', 'there', ',', 'more', 'persistent', 'and', 'longer-lived', 'than', 'the', 'generations', 'of', 'man', ')', 'and', 'planted', 'others', 'like', 'them', ',', 'that', 'flourished', 'without', 'careful', 'tending', '.']
['DET', 'DET', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'PUNCT', 'PRON', 'VERB', 'ADV', 'ADV', 'PUNCT', 'ADV', 'ADJ', 'CONJ', 'ADJ', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'PUNCT', 'CONJ', 'VERB', 'NOUN', 'ADP', 'PRON', 'PUNCT', 'PRON', 'VERB', 'ADP', 'ADJ', 'NOUN', 'PUNCT']


In [97]:
# Making it BERT preprocessable input: the preprocessing layer is called on plain
# strings, so the words of every chunk are joined back into one space-separated
# sentence.
train_text_new = [' '.join(words) for words in train_text]
test_text_new = [' '.join(words) for words in test_text]

In [117]:
# Show the first training chunk as the single space-joined string handed to the
# BERT preprocessing layer.
print(train_text_new[0])

These my grandmother left in their places ( they are still there , more persistent and longer-lived than the generations of man ) and planted others like them , that flourished without careful tending .


In [118]:
def bert_labels(labels):
    """Return ``labels`` as a new list with a '-PAD-' entry added at both ends.

    The two extra entries give the label row one slot before and one slot after
    the real labels (presumably for the special start/end tokens BERT adds to
    every input; confirm against the tokenizer output).
    """
    return ['-PAD-', *labels, '-PAD-']

In [99]:
# Surround the tag list of every chunk with the '-PAD-' entries added by bert_labels().
train_tag_padded = list(map(bert_labels, train_tag))
test_tag_padded = list(map(bert_labels, test_tag))


In [119]:
# Show the tag row of the first training chunk after '-PAD-' was added at both ends.
print(train_tag_padded[0])

['-PAD-', 'DET', 'DET', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'PUNCT', 'PRON', 'VERB', 'ADV', 'ADV', 'PUNCT', 'ADV', 'ADJ', 'CONJ', 'ADJ', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'PUNCT', 'CONJ', 'VERB', 'NOUN', 'ADP', 'PRON', 'PUNCT', 'PRON', 'VERB', 'ADP', 'ADJ', 'NOUN', 'PUNCT', '-PAD-']


In [100]:
def _encode_padded_tags(padded_tag_rows):
    """Turn rows of tag names into one array of tag ids of fixed width.

    Every tag name is looked up in tag_to_index and the row is right-padded with 0
    (the id of '-PAD-') up to MAX_SEQUENCE_LENGTH + 2 entries.
    """
    row_width = MAX_SEQUENCE_LENGTH + 2
    encoded_rows = []
    for tag_row in padded_tag_rows:
        ids = [tag_to_index[tag_name] for tag_name in tag_row]
        filler = [0] * (row_width - len(tag_row))
        encoded_rows.append(np.array(ids + filler))
    return np.array(encoded_rows)


# NOTE(review): these label rows hold one id per whitespace-separated word, while the
# BERT inputs built further down hold one id per token produced by the preprocessing
# model. The recorded outputs for the first training chunk show 39 non-padding input
# ids against 37 label entries, so label and token positions may not line up.
# Confirm the alignment before training on these arrays.
train_tag_ids = _encode_padded_tags(train_tag_padded)
test_tag_ids = _encode_padded_tags(test_tag_padded)

In [101]:
# TF Hub handle of the preprocessing model for uncased English BERT (version 3).
bert_preprocessor = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [102]:
# Load the preprocessing model from TF Hub as a Keras layer; it is called directly on
# the sentence strings in the cells that follow.
bert_preprocess_model = hub.KerasLayer(bert_preprocessor)

In [103]:
# Run the BERT preprocessing layer over the sentence strings of the train and test
# splits. Each result is a mapping with the keys 'input_word_ids', 'input_mask' and
# 'input_type_ids'.
# NOTE(review): every sentence of a split is passed in a single call; presumably this
# fits in memory for the full corpus, so confirm before scaling the data up.
train_text_preprocess = bert_preprocess_model(train_text_new)
test_text_preprocess = bert_preprocess_model(test_text_new)

In [104]:
# List the feature names returned by the preprocessing layer.
train_text_preprocess.keys()

dict_keys(['input_type_ids', 'input_word_ids', 'input_mask'])

In [105]:
# Keep only the first MAX_SEQUENCE_LENGTH + 2 positions of every preprocessed training
# feature so that the inputs have the same width as the label rows.
# One 2-D slice per feature replaces a Python loop over every row; the result is the
# same (rows, MAX_SEQUENCE_LENGTH + 2) array, and an empty split keeps its 2-D shape
# and integer dtype.
# NOTE(review): rows whose token sequence is longer than this width lose their tail
# (presumably including the closing separator id); confirm that this is acceptable.
_bert_width = MAX_SEQUENCE_LENGTH + 2
train_input_ids = np.array(train_text_preprocess['input_word_ids'][:, :_bert_width])
train_type_ids = np.array(train_text_preprocess['input_type_ids'][:, :_bert_width])
train_input_masks = np.array(train_text_preprocess['input_mask'][:, :_bert_width])

In [106]:
# Keep only the first MAX_SEQUENCE_LENGTH + 2 positions of every preprocessed test
# feature so that the inputs have the same width as the label rows.
# One 2-D slice per feature replaces a Python loop over every row; the result is the
# same (rows, MAX_SEQUENCE_LENGTH + 2) array, and an empty split keeps its 2-D shape
# and integer dtype.
# NOTE(review): rows whose token sequence is longer than this width lose their tail
# (presumably including the closing separator id); confirm that this is acceptable.
_bert_width = MAX_SEQUENCE_LENGTH + 2
test_input_ids = np.array(test_text_preprocess['input_word_ids'][:, :_bert_width])
test_type_ids = np.array(test_text_preprocess['input_type_ids'][:, :_bert_width])
test_input_masks = np.array(test_text_preprocess['input_mask'][:, :_bert_width])

In [107]:
# Sanity check: the first row of every input feature and of the label ids should all
# have the same width (MAX_SEQUENCE_LENGTH + 2).
print(len(train_input_ids[0]))
print(len(train_input_masks[0]))
print(len(train_type_ids[0]))
print(len(train_tag_ids[0]))

72
72
72
72


In [108]:
# Inspect the (truncated) token ids produced by the preprocessing layer for the first
# training chunk.
train_input_ids[0]

array([  101,  2122,  2026,  7133,  2187,  1999,  2037,  3182,  1006,
        2027,  2024,  2145,  2045,  1010,  2062, 14516,  1998,  2936,
        1011,  2973,  2084,  1996,  8213,  1997,  2158,  1007,  1998,
        8461,  2500,  2066,  2068,  1010,  2008, 17893,  2302,  6176,
       25069,  1012,   102,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0],
      dtype=int32)

In [109]:
# Inspect the input mask of the first training chunk (presumably 1 marks a real token
# and 0 marks padding; confirm against the preprocessing model's documentation).
train_input_masks[0]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=int32)

In [110]:
# Inspect the input type ids of the first training chunk.
train_type_ids[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=int32)

In [111]:
# Shape of the encoded training labels: (number of chunks, MAX_SEQUENCE_LENGTH + 2).
train_tag_ids.shape

(57999, 72)

In [112]:
# One-hot encode the tag ids of both splits; num_classes is fixed to n_tags so the
# train and test arrays get the same last dimension.
train_tags, test_tags = (
    to_categorical(tag_ids, num_classes=n_tags)
    for tag_ids in (train_tag_ids, test_tag_ids)
)

In [113]:
# Shape of the one-hot training labels: the tag-id array with an extra last axis of
# size n_tags.
train_tags.shape

(57999, 72, 13)

In [121]:
# Print the one-hot label rows of the first training chunk (one row per position).
print(train_tags[0])

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.

In [114]:
# Persist the BERT input features of both splits as compressed .npz files in the
# current working directory. Each array is passed positionally, so NumPy stores it
# under its default key 'arr_0'.
bert_input_files = {
    'train_input_ids.npz': train_input_ids,
    'train_input_masks.npz': train_input_masks,
    'train_type_ids.npz': train_type_ids,
    'test_input_ids.npz': test_input_ids,
    'test_input_masks.npz': test_input_masks,
    'test_type_ids.npz': test_type_ids,
}
for npz_path, feature_array in bert_input_files.items():
    np.savez_compressed(npz_path, feature_array)

In [115]:
# Persist the one-hot labels of both splits as compressed .npz files in the current
# working directory (each stored under NumPy's default key 'arr_0').
for npz_path, tag_array in (('train_tags.npz', train_tags), ('test_tags.npz', test_tags)):
    np.savez_compressed(npz_path, tag_array)