In [1]:
from keras.models import Sequential, load_model
import numpy as np
from keras.layers.recurrent import LSTM
from keras.layers import *
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
import re

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Branje podatkov

In [2]:
OUT_FILE = './podatki/besede_oznake.tsv'

# Reduce the vertical corpus file to a two-column TSV: for every line that
# splits into exactly 14 tab-separated fields, columns 0 and 2 are kept
# (presumably the word form and its tag -- confirm against the corpus docs).
# An empty line is written after every closing </s> element.
#
# Both files are handled by a single `with` so they are closed even if an
# exception interrupts the loop, and the encoding is spelled out so the result
# does not depend on the platform's locale default.
with open('./podatki/ssj500k20.vert', encoding='utf-8') as f, \
        open(OUT_FILE, 'w', encoding='utf-8') as outfile:
    for line in f:
        line_items = re.split(r'\t+', line)
        if len(line_items) == 14:
            outfile.write('{}\t{}\n'.format(line_items[0], line_items[2]))
        elif line.startswith('</s>'):
            outfile.write('\n')

In [3]:
# Read the two-column TSV back in. `with` closes the handle deterministically
# and the encoding is explicit rather than locale-dependent.
with open(OUT_FILE, encoding='utf-8') as f:
    vsebina = f.readlines()

stavki = []          # list of sentences, each a list of words
stavek = []          # words of the sentence currently being collected

oznake = []          # list of label sequences, parallel to `stavki`
oznake_stavka = []   # labels of the sentence currently being collected

for vrstica in vsebina:
    vrstica = vrstica.strip().split('\t')
    if len(vrstica) == 2:
        stavek.append(vrstica[0])
        # Only the first character of the tag is used as the class label.
        oznake_stavka.append(vrstica[1][0])
    elif stavek:
        # Any other line ends the current sentence. Empty sentences (e.g. from
        # consecutive separator lines) are skipped, because downstream cells
        # take max() over the words of every sentence.
        stavki.append(stavek)
        stavek = []
        oznake.append(oznake_stavka)
        oznake_stavka = []

# Flush the last sentence if the file does not end with a separator line.
if len(stavek) > 0:
    stavki.append(stavek)
    oznake.append(oznake_stavka)

In [4]:
maxlen_stavek = 64

# Keep only the sentences (and their label sequences) that have at most
# maxlen_stavek words.
filtrirani_stavki = [s for s in stavki if len(s) <= maxlen_stavek]
filtrirane_oznake = [
    o for (s, o) in zip(stavki, oznake) if len(s) <= maxlen_stavek
]

In [5]:
maxlen_beseda = 16

# Continue from the sentence-length filter's result.
stavki = filtrirani_stavki
oznake = filtrirane_oznake

# Keep only sentences in which every word has at most maxlen_beseda
# characters. all() is used instead of max() so that an empty sentence does
# not raise ValueError (max() of an empty sequence).
filtrirani_stavki = []
filtrirane_oznake = []
for s, o in zip(stavki, oznake):
    if all(len(beseda) <= maxlen_beseda for beseda in s):
        filtrirani_stavki.append(s)
        filtrirane_oznake.append(o)

stavki = filtrirani_stavki
oznake = filtrirane_oznake

## Priprava vektorja razredov

In [6]:
def oznaka2vektor(oznaka, slovar_oznak, dim_oznak):
    """Encode a label as a one-hot vector.

    Args:
        oznaka: the label to encode; must be a key of ``slovar_oznak``.
        slovar_oznak: mapping from label to its position in the vector.
        dim_oznak: length of the resulting vector.

    Returns:
        A float NumPy array of length ``dim_oznak`` that is 0 everywhere
        except for a 1 at ``slovar_oznak[oznaka]``.
    """
    one_hot = np.zeros(dim_oznak)
    position = slovar_oznak[oznaka]
    one_hot[position] = 1
    return one_hot

def vektor2oznaka(vektor, mozne_oznake):
    """Decode a label vector back into its label.

    Works for exact one-hot vectors as well as for score/probability vectors
    (e.g. a softmax output), where the position of the largest value is used.
    An exact ``== 1`` comparison would fail on the latter and on plain lists.

    Args:
        vektor: 1-D sequence or array of scores, one per possible label.
        mozne_oznake: sequence of labels, indexed by vector position.

    Returns:
        The label at the position of the largest entry of ``vektor``.

    Raises:
        IndexError: if ``vektor`` is empty or has no positive entry (e.g. an
            all-zero padding row), i.e. there is no label to decode.
    """
    vektor = np.asarray(vektor)
    if vektor.size == 0 or not np.any(vektor > 0):
        raise IndexError('vektor has no positive entry to decode')
    return mozne_oznake[int(np.argmax(vektor))]

# Label inventory: every distinct label seen in the data, in sorted order so
# that the label -> index mapping is deterministic.
mozne_oznake = sorted(set().union(*oznake))
slovar_oznak = {oznaka: i for (i, oznaka) in enumerate(mozne_oznake)}
dim_oznak = len(mozne_oznake)

# One-hot encode every label and pad each sentence with all-zero rows up to
# maxlen_stavek. The length is passed explicitly: without maxlen,
# pad_sequences pads only to the longest sentence present, so the shape
# assertion below would fail whenever no sentence is exactly maxlen_stavek
# words long.
y = pad_sequences(
    [[oznaka2vektor(oznaka, slovar_oznak, dim_oznak) for oznaka in o]
     for o in oznake],
    maxlen=maxlen_stavek)
assert y.shape == (len(oznake), maxlen_stavek, dim_oznak)

## Priprava vektorja značilk

In [7]:
# Character inventory: every distinct character occurring in any word, sorted
# so that the character -> code mapping is deterministic.
mozni_znaki = sorted({znak for s in stavki for beseda in s for znak in beseda})

# Codes start at 1; 0 is left free for the padding added by pad_sequences.
slovar_znakov = {znak: koda for (koda, znak) in enumerate(mozni_znaki, start=1)}

dim_znakov = len(slovar_znakov)

# Encode each word as a sequence of character codes padded to maxlen_beseda,
# then pad each sentence with all-zero word rows up to maxlen_stavek.
x = pad_sequences(
    [
        pad_sequences(
            [[slovar_znakov[z] for z in b] for b in s],
            maxlen=maxlen_beseda,
        )
        for s in stavki
    ],
    maxlen=maxlen_stavek,
)
assert x.shape == (len(stavki), maxlen_stavek, maxlen_beseda)

## Gradnja modela

In [15]:
# Character-level CNN that turns one word (a sequence of maxlen_beseda
# character codes) into a flat feature vector.
cnn = Sequential()
# Character codes run from 1 to dim_znakov and 0 is used for padding, so the
# embedding table needs dim_znakov + 1 rows; with only dim_znakov rows the
# highest character code would be out of range.
cnn.add(Embedding(dim_znakov + 1, 15, input_length=maxlen_beseda))

# Stack of convolutions with kernel widths 1..6 and 25*width filters each.
# (An earlier variant used widths 1..7 with min(200, 50*width) filters.)
filter_widths = list(range(1, 7))
n_filters = [25*w for w in filter_widths]

for i in range(len(filter_widths)):
    cnn.add(Conv1D(filters=n_filters[i], kernel_size=filter_widths[i], padding='causal', activation='tanh'))
    cnn.add(Dropout(0.1))
cnn.add(MaxPooling1D(pool_size=(2)))
cnn.add(Flatten())

# Sentence-level model: the word CNN is applied to every word position, a
# bidirectional LSTM runs over the resulting word vectors, and a softmax layer
# predicts one label per word.
model = Sequential()
# mask_value=0. targets the all-zero word rows that pad short sentences.
model.add(Masking(mask_value=0., input_shape=(maxlen_stavek, maxlen_beseda)))
model.add(TimeDistributed(cnn))
model.add(Bidirectional(LSTM(300, return_sequences=True, dropout=0.2, recurrent_dropout=0.2, implementation=2), merge_mode='concat'))
model.add(Dense(dim_oznak, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

## Učenje modela

In [16]:
# Stop training once the validation loss has gone 5 epochs without improving.
earlystop_cb = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=0,
    mode='auto',
)

# Train on the first 20000 sentences; 5% of them are held out for validation.
model.fit(
    x[:20000],
    y[:20000],
    epochs=40,
    validation_split=0.05,
    callbacks=[earlystop_cb],
)

Train on 19000 samples, validate on 1000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40


<keras.callbacks.History at 0x7f69284eb518>

## Evalvacija

In [17]:
# Evaluate on sentences 20000..24999, which were not used for training.
loss, accuracy = model.evaluate(
    x[20000:25000],
    y[20000:25000],
)
print(loss, accuracy, sep='\n')

0.09513550627827644
0.971920181274414


In [18]:
# Persist the trained model to an HDF5 file in the working directory.
model.save('model2.h5')

## Primeri

In [19]:
# Inverse of slovar_znakov: character code -> character.
obrnjen_slovar_znakov = dict(zip(slovar_znakov.values(), slovar_znakov.keys()))

def unpad_stavek(vlozitve):
    stavki = []
    for v_stavek in vlozitve:
        stavki.append([])
        for v_beseda in v_stavek:
            if sum(v_beseda) == 0:
                continue
                
            stavki[-1].append('')
            for v_znak in v_beseda:
                if v_znak == 0:
                    continue
                stavki[-1][-1] += obrnjen_slovar_znakov[v_znak]
    return stavki

In [38]:
# Show the character inventory, a blank line, and then its size.
print(mozni_znaki, '', len(mozni_znaki), sep='\n')

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}', '©', '«', '¯', '°', 'µ', '»', 'Á', 'Ä', 'Å', 'É', 'Ó', 'Ö', '×', 'à', 'á', 'â', 'ã', 'ä', 'ç', 'è', 'é', 'ê', 'ë', 'í', 'î', 'ó', 'ô', 'ö', 'ø', 'ü', 'Ć', 'ć', 'Č', 'č', 'Đ', 'đ', 'Š', 'š', 'Ž', 'ž', '–', '—', '‘', '’', '“', '”', '•', '…']

136
