# Text Preprocessing

generate integer-indexed sentences, POS tags and named-entity tags, the dictionaries for converting between strings and indices, etc., and save everything as `npy` binaries.

In [1]:
import pandas as pd
import numpy as np
from preprocessing import get_vocab, index_sents
from embedding import create_embeddings
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Notebook-wide settings.
TEST_SIZE = 0.15   # fraction of the sentences held out as the test set
MAX_VOCAB = 25000  # upper bound on the word vocabulary size (PAD and UNK slots included)

### read CoNLL-2002 NER corpus from csv (first save as utf-8!)

In [3]:
# Load the token-level corpus (one row per token) from the UTF-8 re-saved CSV.
corpus_csv = 'data/ner_dataset_utf8.csv'
data = pd.read_csv(corpus_csv)

In [4]:
# Sentence markers as strings: rows that do not start a sentence hold NaN in
# this column, which str() turns into the literal 'nan'.
sentmarks = list(map(str, data["Sentence #"].tolist()))
sentmarks[:5]

['Sentence: 1', 'nan', 'nan', 'nan', 'nan']

In [5]:
# Pull the three token-level columns out of the frame as plain Python lists.
words, postags, nertags = (
    data[column].tolist() for column in ("Word", "POS", "Tag")
)

In [6]:
# Group the flat, token-level columns into per-sentence lists.
#
# Reads:  sentmarks, words, postags, nertags (parallel lists, one entry per token).
# Builds: sentence_text / sentence_post / sentence_ners (parallel lists of
#         sentences) and vocab (every lower-cased token, including tokens of
#         sentences that end up being discarded).
sentence_text = []
sentence_post = []
sentence_ners = []

vocab = []

this_snt = []
this_pos = []
this_ner = []


def _flush_sentence(snt, pos, ner):
    """Store one buffered sentence in the output lists if it passes the filters.

    The sentence is kept only when the buffer is non-empty, its last word is
    the string '0', and at least one of its NER tags differs from 'O'.
    The final token is dropped from all three stored lists.
    """
    # NOTE(review): this compares the last *word* with '0' (digit zero). If the
    # intent was "the last NER tag is 'O'" (letter O), the test should be
    # `ner[-1] == 'O'` -- confirm against the corpus before changing it.
    if len(snt) > 0 and snt[-1] == '0':
        # edit: ONLY IF HAS TAG! (skip sentences whose tags are all 'O')
        if any(tag != 'O' for tag in ner):
            sentence_text.append(snt[:-1])
            sentence_post.append(pos[:-1])
            sentence_ners.append(ner[:-1])


for mark, word, pos_tag, ner_tag in zip(sentmarks, words, postags, nertags):
    # a non-'nan' marker means this token opens a new sentence:
    # store the finished one, then start fresh buffers
    if mark != 'nan':
        _flush_sentence(this_snt, this_pos, this_ner)
        this_snt = []
        this_pos = []
        this_ner = []

    # add to lists
    token = word.lower()
    this_snt.append(token)
    this_pos.append(pos_tag)
    this_ner.append(ner_tag)
    vocab.append(token)

# The loop only stores a sentence when the *next* one begins, so the final
# sentence of the corpus has to be flushed explicitly or it is silently lost.
_flush_sentence(this_snt, this_pos, this_ner)

In [7]:
# Show the first two sentences with their POS and NER tags, one blank line after each.
for tokens, pos_tags, ner_tags in zip(sentence_text[:2], sentence_post, sentence_ners):
    print(tokens, pos_tags, ner_tags, '', sep='\n')

['thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'london', 'to', 'protest', 'the', 'war', 'in', 'iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'british', 'troops', 'from', 'that', 'country']
['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O']

['they', 'marched', 'from', 'the', 'houses', 'of', 'parliament', 'to', 'a', 'rally', 'in', 'hyde', 'park']
['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', 'TO', 'DT', 'NN', 'IN', 'NNP', 'NNP']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'I-geo']



## get vocabulary and index inputs

we need to convert the string input to integer vectors for the `keras` network (the `pycrfsuite` network needs strings, as it will extract feature vectors from words themselves).

we will index each word from 1 in descending order of frequency (the most common word is 1, etc.) up to the max-vocab size. We reserve two slots: 0 for the PAD index, and `MAX_VOCAB-1` for out-of-vocabulary or unknown words (OOV/UNK). Since this is boring stuff, I've put it in external functions. Packages like `keras` and `sklearn` have more robust tools for this, but a simple word:index dictionary will do fine for this experiment.

In [8]:
# Lookup tables for the sentence text, as returned by get_vocab.
# Two slots of the MAX_VOCAB budget are held back for UNK and PAD.
word_slots = MAX_VOCAB - 2
word2idx, idx2word = get_vocab(sentence_text, word_slots)

In [9]:
# Lookup tables for the POS-tag and NER-tag sequences.
# The size handed to get_vocab is the number of distinct labels in the raw
# column (plus 2 in the NER case).
n_pos_labels = len(set(postags))
n_ner_labels = len(set(nertags)) + 2

pos2idx, idx2pos = get_vocab(sentence_post, n_pos_labels)
ner2idx, idx2ner = get_vocab(sentence_ners, n_ner_labels)

In [10]:
# Run each sentence list through index_sents with its matching lookup table.
sentence_text_idx, sentence_post_idx, sentence_ners_idx = (
    index_sents(sentences, lookup)
    for sentences, lookup in (
        (sentence_text, word2idx),
        (sentence_post, pos2idx),
        (sentence_ners, ner2idx),
    )
)

## train-test splitting

we divide the data into training data and testing data. the testing data is used only for checking model performance. a third set, the *validation set*, may be split off from the training data for hyperparameter tuning, although if we use k-fold cross-validation, the validation set will change every fold.

In [11]:
# Fixed seed so the train/test partition -- and everything derived from it
# (saved index lists, split arrays, embeddings trained on the training part) --
# is the same on every Restart & Run All.
SPLIT_SEED = 42

# Sentence positions are split alongside the POS sequences so that the very
# same partition can afterwards be applied to the other parallel lists.
indices = list(range(len(sentence_text)))

train_idx, test_idx, X_train_pos, X_test_pos = train_test_split(
    indices,
    sentence_post_idx,
    test_size=TEST_SIZE,
    random_state=SPLIT_SEED,
)

def get_sublist(lst, indices):
    """Return a new list with the elements of ``lst`` at ``indices``, in that order."""
    return [lst[position] for position in indices]

# Apply the train/test partition to the indexed sentences (inputs) and to the
# indexed NER tags (targets).
X_train_sents, X_test_sents = (
    get_sublist(sentence_text_idx, part) for part in (train_idx, test_idx)
)
y_train_ner, y_test_ner = (
    get_sublist(sentence_ners_idx, part) for part in (train_idx, test_idx)
)

## create word2vec embeddings for words, pos-tags

using pre-trained embedding vectors to initialize the embedding layer has been shown to help training for various sequence labeling tasks such as POS tagging (Huang, Xu & Yu 2015; Ma & Hovy 2016) and Named Entity Recognition for English (Ma & Hovy 2016; Lee Changki 2017) and Japanese (Misawa, Taniguchi, Miura & Ohkuma 2017).

because we are using the POS-tags as a secondary input, we will also train an embedding space for these. we will use only the training data to create the embeddings. i am using `gensim` for this task, with a helper function that wraps `Word2Vec` and saves both the embedding and the vocabulary dictionary.

In [14]:
# word embeddings, built from the (string) sentences of the training split only
train_sent_texts = [sentence_text[sent_no] for sent_no in train_idx]

# options for the create_embeddings helper; the two paths are where it is
# asked to store the embedding model and the vocabulary mapping
text_w2v_options = dict(
    embeddings_path='embeddings/text_embeddings.gensimmodel',
    vocab_path='embeddings/text_mapping.json',
    size=300,
    workers=4,
    iter=20,
)
w2v_vocab, w2v_model = create_embeddings(train_sent_texts, **text_w2v_options)

In [15]:
# POS-tag embeddings, built from the tag sequences of the training split only
train_post_texts = [sentence_post[sent_no] for sent_no in train_idx]

# options for the create_embeddings helper; the two paths are where it is
# asked to store the embedding model and the vocabulary mapping
pos_w2v_options = dict(
    embeddings_path='embeddings/pos_embeddings.gensimmodel',
    vocab_path='embeddings/pos_mapping.json',
    size=100,
    workers=4,
    iter=20,
)
w2v_pvocab, w2v_pmodel = create_embeddings(train_post_texts, **pos_w2v_options)

## save everything to numpy binaries for loading

granted, `pickle` would probably be more suitable for a lot of these things. but over-reliance on `numpy` binaries is a bad habit i've picked up.

In [16]:
def numpy_save(saves, names):
    """Write every object in ``saves`` to ``encoded/<name>.npy``.

    Parameters
    ----------
    saves : sequence
        Objects to store (lists, nested lists, dicts, arrays, ...).
    names : sequence of str
        File stems, parallel to ``saves``.

    Raises
    ------
    ValueError
        If ``saves`` and ``names`` differ in length; nothing is written then.

    Notes
    -----
    Anything that is not a regular numeric/string array (dicts, lists of
    variable-length lists) is stored as a pickled object array, so it has to
    be read back with ``np.load(path, allow_pickle=True)``.
    """
    import os

    # Fail before touching the disk rather than mislabel or skip an item.
    if len(saves) != len(names):
        raise ValueError(
            'saves and names must have the same length ({0} != {1})'.format(
                len(saves), len(names)))

    # np.save does not create missing directories.
    os.makedirs('encoded', exist_ok=True)

    for name, item in zip(names, saves):
        try:
            array = np.asanyarray(item)
        except ValueError:
            # Ragged nested lists (e.g. sentences of different lengths) are
            # rejected by recent NumPy unless dtype=object is given explicitly.
            array = np.array(item, dtype=object)
        np.save('encoded/{0}.npy'.format(name), array)
    return

# Every artifact to persist, paired with the file stem it is saved under.
artifacts = [
    ('vocab', vocab),
    ('sentence_text_idx', sentence_text_idx),
    ('sentence_post_idx', sentence_post_idx),
    ('sentence_ners_idx', sentence_ners_idx),
    ('word2idx', word2idx),
    ('idx2word', idx2word),
    ('pos2idx', pos2idx),
    ('idx2pos', idx2pos),
    ('ner2idx', ner2idx),
    ('idx2ner', idx2ner),
    ('train_idx', train_idx),
    ('test_idx', test_idx),
    ('X_train_sents', X_train_sents),
    ('X_test_sents', X_test_sents),
    ('X_train_pos', X_train_pos),
    ('X_test_pos', X_test_pos),
    ('y_train_ner', y_train_ner),
    ('y_test_ner', y_test_ner),
    ('sentence_text', sentence_text),
    ('sentence_post', sentence_post),
    ('sentence_ners', sentence_ners),
]

# numpy_save expects two parallel lists: the objects and their names.
saves = [payload for _, payload in artifacts]
names = [stem for stem, _ in artifacts]

numpy_save(saves, names)