In [1]:
import pandas as pd
import numpy as np
from dataset import get_vocab, index_sents
from embedding import create_embeddings
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Size limit handed to get_vocab when building the word vocabulary.
MAX_VOCAB = 25000
# Passed as test_size to train_test_split (0.15 = 15% of the samples held out).
TEST_SIZE = 0.15

### Read the CoNLL-2002 NER corpus from CSV (the file must first be re-saved as UTF-8!)

In [3]:
# Load the corpus table; the path is relative to the notebook's working directory.
data = pd.read_csv('data/ner_dataset_utf8.csv')

In [4]:
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [5]:
# Sentence-marker column as plain strings; rows without a marker (NaN) become 'nan'.
sentmarks = [str(mark) for mark in data["Sentence #"].tolist()]
sentmarks[:5]

['Sentence: 1', 'nan', 'nan', 'nan', 'nan']

In [6]:
# Word, POS-tag and NER-tag columns as parallel Python lists (one entry per row).
words, postags, nertags = (
    data[column].tolist() for column in ("Word", "POS", "Tag")
)

In [7]:
# Kept sentences, stored as three parallel lists (tokens, POS tags, NER tags).
sentence_text = []
sentence_post = []
sentence_ners = []

# Every lower-cased token in the file, in row order, whether or not its
# sentence is kept.
vocab = []

# Buffers for the sentence currently being read.
this_snt = []
this_pos = []
this_ner = []


def _flush_sentence(tokens, pos_tags, ner_tags):
    """Append one buffered sentence to the output lists if it passes the filters.

    A sentence is kept only when its final token is the string '0' and it
    carries at least one NER tag other than 'O'. The final token (and its
    POS/NER tags) is dropped from what gets stored.

    NOTE(review): the terminator is matched as '0'; presumably that is how the
    sentence-final punctuation appears in this CSV export - confirm against the
    data file.
    """
    if len(tokens) > 0 and tokens[-1] == '0':
        if set(ner_tags) != {'O'}:
            sentence_text.append(tokens[:-1])
            sentence_post.append(pos_tags[:-1])
            sentence_ners.append(ner_tags[:-1])


for idx, s in enumerate(sentmarks):
    # A non-'nan' marker starts a new sentence: emit the previous one and reset.
    if s != 'nan':
        _flush_sentence(this_snt, this_pos, this_ner)
        this_snt = []
        this_pos = []
        this_ner = []

    # Add the current row to the running sentence and to the global token list.
    token = words[idx].lower()
    this_snt.append(token)
    this_pos.append(postags[idx])
    this_ner.append(nertags[idx])
    vocab.append(token)

# The final sentence has no following marker to trigger a flush, so emit it
# explicitly; otherwise it would be silently dropped.
_flush_sentence(this_snt, this_pos, this_ner)

In [8]:
# Show the first two kept sentences with their aligned POS and NER tag sequences,
# each followed by a blank line.
for position, tokens in enumerate(sentence_text[:2]):
    print(tokens, sentence_post[position], sentence_ners[position], '', sep='\n')

['thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'london', 'to', 'protest', 'the', 'war', 'in', 'iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'british', 'troops', 'from', 'that', 'country']
['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O']

['they', 'marched', 'from', 'the', 'houses', 'of', 'parliament', 'to', 'a', 'rally', 'in', 'hyde', 'park']
['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', 'TO', 'DT', 'NN', 'IN', 'NNP', 'NNP']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'I-geo']



### get vocabularies and index inputs

In [9]:
# text vocab dicts: word -> index and index -> word, built from the kept
# sentences with MAX_VOCAB as the size limit passed to get_vocab
word2idx, idx2word = get_vocab(sentence_text, MAX_VOCAB)


total vocab size: 29341 


trunc vocab size: 24998 



In [10]:
# POS and NER tag vocab dicts.
# The size limit is the number of distinct tags plus 2 (for UNK and PAD),
# so that no real tag is truncated away.
n_pos_entries = len(set(postags)) + 2
pos2idx, idx2pos = get_vocab(sentence_post, n_pos_entries)

n_ner_entries = len(set(nertags)) + 2
ner2idx, idx2ner = get_vocab(sentence_ners, n_ner_entries)


total vocab size: 42 


trunc vocab size: 42 


total vocab size: 17 


trunc vocab size: 17 



In [11]:
# index: run each list of sentences through index_sents with its own vocab mapping
sentence_text_idx, sentence_post_idx, sentence_ners_idx = (
    index_sents(sequences, mapping)
    for sequences, mapping in (
        (sentence_text, word2idx),
        (sentence_post, pos2idx),
        (sentence_ners, ner2idx),
    )
)

### Create word2vec embeddings for words and POS tags

In [12]:
# sentence embeddings: write one space-joined sentence per line, then pass
# that file to create_embeddings

with open('embeddings/sent_text.txt', 'w') as corpus_file:
    corpus_file.writelines(' '.join(tokens) + '\n' for tokens in sentence_text)

w2v_vocab, w2v_model = create_embeddings(
    'embeddings/sent_text.txt',
    embeddings_path='embeddings/text_embeddings.gensimmodel',
    vocab_path='embeddings/text_mapping.json',
    workers=4,
    iter=20,
)

In [13]:
# pos embeddings: write one space-joined POS-tag sequence per line, then pass
# that file to create_embeddings

with open('embeddings/sent_pos.txt', 'w') as f:
    # Write the POS-tag sequences (sentence_post), not the word sequences,
    # so this corpus file really contains tags.
    for s in sentence_post:
        f.write(' '.join(s))
        f.write('\n')

w2v_pvocab, w2v_pmodel = create_embeddings('embeddings/sent_pos.txt',
                         embeddings_path='embeddings/pos_embeddings.gensimmodel',
                         vocab_path='embeddings/pos_mapping.json',
                         workers=4,
                         iter=20)

### train-test splitting

In [14]:
# Positions of all kept sentences. Splitting the positions (alongside the POS
# inputs) lets the same partition be applied to the other parallel lists.
indices = list(range(len(sentence_text)))

# Fixed seed so the train/test partition is identical on every re-run.
SPLIT_SEED = 42

train_idx, test_idx, X_train_pos, X_test_pos = train_test_split(
    indices, sentence_post_idx, test_size=TEST_SIZE, random_state=SPLIT_SEED)

def get_sublist(lst, indices):
    """Return a new list holding ``lst[i]`` for each ``i`` in ``indices``, in that order.

    Args:
        lst: indexable sequence to pick from.
        indices: iterable of positions into ``lst``; repeats are allowed.

    Returns:
        list: the selected elements, one per entry of ``indices``.
    """
    return [lst[position] for position in indices]

# Apply the train/test positions to the indexed word sequences (inputs) ...
X_train_sents, X_test_sents = (
    get_sublist(sentence_text_idx, part) for part in (train_idx, test_idx)
)
# ... and to the indexed NER tag sequences (targets).
y_train_ner, y_test_ner = (
    get_sublist(sentence_ners_idx, part) for part in (train_idx, test_idx)
)

### save everything to numpy binaries for loading

In [15]:
def numpy_save(saves, names):
    """Save each object to ``data/<name>.npy`` with ``np.save``.

    Args:
        saves: iterable of objects to store (lists, nested lists, dicts, ...).
        names: sequence of file stems; ``names[i]`` names the file written
            for the i-th item of ``saves``.

    Returns:
        None.

    Raises:
        IndexError: if ``names`` has fewer entries than ``saves``.
    """
    for idx, item in enumerate(saves):
        try:
            array = np.asanyarray(item)
        except ValueError:
            # Ragged nested lists (e.g. sentences of different lengths) cannot
            # form a regular array, and recent NumPy raises instead of building
            # an object array implicitly. Request the object array explicitly.
            array = np.asarray(item, dtype=object)
        np.save('data/{0}.npy'.format(names[idx]), array)

# Each artifact is listed next to the file stem it is saved under, so the two
# cannot drift out of alignment.
_named_artifacts = [
    ('vocab', vocab),
    ('sentence_text_idx', sentence_text_idx),
    ('sentence_post_idx', sentence_post_idx),
    ('sentence_ners_idx', sentence_ners_idx),
    ('word2idx', word2idx),
    ('idx2word', idx2word),
    ('pos2idx', pos2idx),
    ('idx2pos', idx2pos),
    ('ner2idx', ner2idx),
    ('idx2ner', idx2ner),
    ('train_idx', train_idx),
    ('test_idx', test_idx),
    ('X_train_sents', X_train_sents),
    ('X_test_sents', X_test_sents),
    ('X_train_pos', X_train_pos),
    ('X_test_pos', X_test_pos),
    ('y_train_ner', y_train_ner),
    ('y_test_ner', y_test_ner),
]

saves = [artifact for _stem, artifact in _named_artifacts]
names = [stem for stem, _artifact in _named_artifacts]

numpy_save(saves, names)

In [16]:
# Also store the un-indexed token, POS-tag and NER-tag sequences.
_named_sequences = [
    ('sentence_text', sentence_text),
    ('sentence_post', sentence_post),
    ('sentence_ners', sentence_ners),
]

saves = [sequences for _stem, sequences in _named_sequences]
names = [stem for stem, _sequences in _named_sequences]

numpy_save(saves, names)

In [17]:
# (mean sentence length, longest sentence length) over the kept sentences
sent_lengths = [len(tokens) for tokens in sentence_text]
sum(sent_lengths) / len(sentence_text), max(sent_lengths)

(21.71625782103486, 103)

In [18]:
# Number of sentences kept after filtering.
len(sentence_text)

40276