In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from dataset import get_vocab, index_sents
from embedding import create_embeddings

Using TensorFlow backend.


In [2]:
# Vocabulary size cap handed to get_vocab() when the word index is built.
MAX_VOCAB = 18000
# Passed as `size` to create_embeddings(); presumably the word2vec vector
# dimensionality -- TODO confirm in embedding.py.
EMBEDDING_SIZE = 128
# Passed as `test_size` to train_test_split(): fraction of sentences held out for testing.
TEST_SIZE = 0.15

### Read the metaphor corpus from CSV (the source file must first be saved as UTF-8!)

In [3]:
# Load the sentence corpus into a DataFrame; the 'Sentence' column is what the
# following cells consume.
data = pd.read_csv('data/sentences_utf8.csv')

In [4]:
# Preview the first rows to sanity-check the parsed columns.
data.head()

Unnamed: 0,Sentence Number,Sentence
0,0,latest corporate unbundler reveals@ laid-back ...
1,1,by frank kane
2,2,it seems that roland franklin the latest unbun...
3,3,he has not properly investigated the target@'s...
4,4,the #-year-old head@ of pembridge investments ...


In [5]:
# Pull the sentence column out as a plain list and stringify every entry;
# missing cells (float NaN) turn into the literal string 'nan'.
rawsents = data['Sentence'].tolist()
raw_edit = list(map(str, rawsents))
print('raw lens', len(raw_edit))

# Report how many stringified-NaN placeholders there are before removing them.
raw_counts = Counter(raw_edit)
print('nans:', raw_counts['nan'])

raw_edit = list(filter(lambda text: text != 'nan', raw_edit))
print('end lens', len(raw_edit))

raw lens 16202
nans: 54
end lens 16148


In [6]:
%%time
# Turn every raw sentence into two parallel lists:
#   sents[i]  -- the tokens with the '@' marker stripped
#   labels[i] -- 1.0 for tokens that carried an '@' marker, 0.0 otherwise
# (the '@' marker is presumably the metaphor annotation of the corpus -- confirm
# against the corpus documentation).
sents = []
labels = []

for raw in raw_edit:
    # split() without an argument splits on runs of whitespace and ignores
    # leading/trailing whitespace. The previous split(' ') produced empty-string
    # tokens for doubled or trailing spaces, which then leaked into the
    # vocabulary, the embedding text file and the sentence-length statistics.
    tokens = raw.split()
    # str.replace is a no-op on tokens without '@', so it can be applied to all.
    sents.append([tok.replace('@', '') for tok in tokens])
    labels.append([1.0 if '@' in tok else 0.0 for tok in tokens])

print(len(sents), len(labels))

16148 16148
CPU times: user 160 ms, sys: 4 ms, total: 164 ms
Wall time: 160 ms


### get vocabularies and index inputs

In [7]:
# text vocab dicts
# get_vocab is a project helper (dataset.py): it receives the tokenised
# sentences and the MAX_VOCAB cap and returns three objects. Going by the
# names these are the vocabulary plus word->index and index->word lookups --
# TODO confirm in dataset.py.
vocab, word2idx, idx2word = get_vocab(sents, MAX_VOCAB)


total vocab size: 18001 


trunc vocab size: 17998 



In [8]:
# index sentences
# index_sents is a project helper (dataset.py) given the tokenised sentences and
# the word2idx lookup; presumably it replaces each token by its integer index --
# TODO confirm how out-of-vocabulary tokens are handled.
sents_idx = index_sents(sents, word2idx)

### create word2vec embeddings for words, pos-tags

In [9]:
# sentence embeddings - use 'sents' so no ner tags
# Dump the tokenised corpus to a temporary text file, one space-joined sentence
# per line; create_embeddings is then pointed at that file by path.
# The encoding is pinned to UTF-8: without it open() falls back to the platform
# locale codec, which can fail or mis-encode non-ASCII tokens of the UTF-8
# corpus on systems whose default is not UTF-8.
# NOTE(review): assumes create_embeddings reads the file back as UTF-8 -- confirm
# in embedding.py.
with open('embeddings/temp_text.txt', 'w', encoding='utf-8') as f:
    for s in sents:
        f.write(' '.join(s))
        f.write('\n')

# create_embeddings is a project helper (embedding.py); the keyword arguments
# are forwarded unchanged. It returns the embedding vocabulary and the model.
w2v_vocab, w2v_model = create_embeddings('embeddings/temp_text.txt',
                       embeddings_path='embeddings/text_embeddings.gensimmodel',
                       vocab_path='embeddings/text_mapping.json',
                       workers=7,
                       min_count=1,
                       size=EMBEDDING_SIZE,
                       iter=50)

### train-test splitting

In [10]:
# Fixed seed so that the train/test partition (which is saved to disk below) is
# identical on every Restart & Run All; without random_state the shuffle differs
# on each run and previously saved splits cannot be reproduced.
RANDOM_STATE = 42

# Hold out TEST_SIZE of the indexed sentences (and their per-token labels).
X_train, X_test, y_train, y_test = train_test_split(
    sents_idx, labels, test_size=TEST_SIZE, random_state=RANDOM_STATE)

### save everything to numpy binaries for loading

In [11]:
def numpy_save(saves, names, out_dir='data'):
    """Save every object in ``saves`` as ``<out_dir>/<name>.npy``.

    Parameters
    ----------
    saves : iterable
        Objects to persist with ``np.save``. Anything NumPy cannot store
        natively (dicts, ragged nested lists) is written as a pickled object
        array and must be read back with ``np.load(..., allow_pickle=True)``.
    names : sequence of str
        File stems, matched to ``saves`` by position. Extra names are ignored.
    out_dir : str, optional
        Directory the files are written to; it must already exist. Defaults
        to ``'data'``, the location that used to be hard-coded.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``names`` has fewer entries than ``saves``. This is checked before
        anything is written, so a bad call no longer leaves a partial set of
        files behind.
    """
    saves = list(saves)
    if len(names) < len(saves):
        raise IndexError(
            'numpy_save got {0} objects but only {1} names'.format(
                len(saves), len(names)))

    for name, item in zip(names, saves):
        try:
            arr = np.asanyarray(item)
        except ValueError:
            # Ragged input such as variable-length sentences: NumPy >= 1.24
            # refuses to build the array implicitly, so request the object
            # dtype explicitly (this is what older versions did implicitly).
            arr = np.asarray(item, dtype=object)
        np.save('{0}/{1}.npy'.format(out_dir, name), arr)
    return

# Pair every artefact with the file stem it is stored under, then split the
# pairs into the two parallel lists numpy_save expects.
artefacts = [
    ('vocab', vocab),
    ('word2idx', word2idx),
    ('idx2word', idx2word),
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
]

names = [stem for stem, _ in artefacts]
saves = [obj for _, obj in artefacts]

numpy_save(saves, names)

In [12]:
# Mean and maximum sentence length, measured in tokens.
sent_lens = list(map(len, sents))
sum(sent_lens) / len(sent_lens), max(sent_lens)

(12.7669680455784, 99)

In [13]:
# Number of sentences that remain after the missing rows were dropped.
len(sents)

16148