In [1]:
import sys
# Extend the module search path so project-local helper modules can be imported.
# Presumably `metrics` and `embeddings` (imported in the next cell) live in
# ./utils — TODO confirm. The path is relative, so it resolves against the
# kernel's current working directory.
sys.path.append('./utils')

In [2]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import LSTM, Dropout, Merge, Embedding, Dense
from keras.models import Sequential
from keras import optimizers
from sklearn.model_selection import train_test_split
from metrics import evaluate_model
from embeddings import average_weights_embeddings, save_embeddings

Using TensorFlow backend.


In [3]:
# --- Text representation ---
DIM = 100          # size of each learned word-embedding vector
MAXLEN_SEQ = 550   # every essay is padded/truncated to this many tokens

# --- Training ---
DROPOUT = 0.2      # dropout rate used after the embeddings and inside the LSTMs
BATCH_SIZE = 128
EPOCHS = 10

# --- Convolutional branches ---
# Each branch is described by a (number of filters, kernel size) pair; the two
# tuples consumed by the rest of the notebook are derived from these pairs so
# that they always have matching lengths.
_BRANCH_SPECS = (
    (8, 4),
    (16, 8),
    (32, 10),
    (64, 12),
)
FILTERS, KERNEL_SIZE = zip(*_BRANCH_SPECS)
BRANCHES = len(FILTERS)  # number of parallel input branches of the model

In [4]:
# Load the raw training data from the Excel workbook next to the notebook.
# Later cells read the columns 'essay', 'domain1_score' and 'domain2_score'.
TRAINING_SET_PATH = 'training_set.xls'
df = pd.read_excel(TRAINING_SET_PATH)

In [5]:
# Regression target: the mean of the two domain scores of each essay.
# `DataFrame.mean(axis=1)` skips NaN by default, so an essay with only one
# score keeps that score and an essay with no score at all gets NaN (such rows
# are dropped by a later cell). This replaces a row-wise
# `apply(lambda x: np.nanmean(x), axis=1)`, which computes the same values but
# runs a Python call per row and raises a "Mean of empty slice" RuntimeWarning
# for rows where both scores are missing.
df['target'] = df[['domain1_score', 'domain2_score']].mean(axis=1)

  """Entry point for launching an IPython kernel.


In [6]:
# Keep only the model input (essay text) and the regression target.
df = df.loc[:, ['essay', 'target']]

In [7]:
# Discard every row that has a missing value in any remaining column.
df = df.dropna(axis=0)

In [8]:
# Build the vocabulary from all essays, then encode each essay as a list of
# integer word indices using that vocabulary.
essays = df['essay']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(essays)
sequences = tokenizer.texts_to_sequences(essays)

In [9]:
# Mapping word -> integer index learned by the tokenizer.
word_index = tokenizer.word_index
n_unique_tokens = len(word_index)
# One extra slot on top of the known words; Keras word indices start at 1,
# leaving index 0 free (it is the value used for padding).
vocab_size = n_unique_tokens + 1
print('Найдено %s уникальных токенов.' % n_unique_tokens)

Найдено 41432 уникальных токенов.


In [10]:
# Bring every encoded essay to exactly MAXLEN_SEQ tokens. The Keras defaults
# are spelled out: shorter essays are zero-padded at the front, longer ones
# keep only their last MAXLEN_SEQ tokens.
X = pad_sequences(
    sequences,
    maxlen=MAXLEN_SEQ,
    padding='pre',
    truncating='pre',
)

In [11]:
# Regression target as 32-bit floats.
y = df['target'].astype(np.float32)

In [12]:
# Hold out 20% of the essays; the fixed seed makes the split reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [13]:
def _conv_branch(n_filters, kernel_width):
    """Build one input branch: Embedding -> Dropout -> Conv1D -> MaxPooling1D.

    Args:
        n_filters: number of convolution filters; also used to name the
            embedding layer ('emb-<n_filters>').
        kernel_width: width of the 1-D convolution window, in tokens.

    Returns:
        A `Sequential` model holding the four layers of the branch.
    """
    branch = Sequential()
    branch.add(Embedding(vocab_size, DIM, name='emb-{}'.format(n_filters)))
    branch.add(Dropout(DROPOUT))
    # padding='same' keeps the sequence length independent of the kernel width,
    # so all branches yield outputs that can be concatenated.
    branch.add(Conv1D(filters=n_filters,
                      kernel_size=kernel_width,
                      padding='same',
                      activation='relu'))
    branch.add(MaxPooling1D(pool_size=2))
    return branch


# Every branch needs both a filter count and a kernel size.
assert len(FILTERS) == len(KERNEL_SIZE)
submodels = [_conv_branch(n_filters, kernel_width)
             for n_filters, kernel_width in zip(FILTERS, KERNEL_SIZE)]

# Head of the network: concatenate the branch outputs, run two stacked LSTMs
# over the merged sequence and regress a single non-negative score.
# NOTE(review): `Merge` is a legacy Keras layer and appears to trigger the
# warning recorded in this cell's output; it is kept because the helper
# functions used later receive this exact model structure.
model = Sequential()
model.add(Merge(submodels, mode="concat"))
model.add(LSTM(128, return_sequences=True, dropout=DROPOUT))
model.add(LSTM(64, dropout=DROPOUT))
model.add(Dense(1, activation='relu'))

print('Compiling model')
model.compile(
    loss='mse',
    optimizer=optimizers.rmsprop(lr=0.002),
    metrics=['mae'],
)

  app.launch_new_instance()


Compiling model


In [14]:
# The model has one input per branch and every branch reads the same padded
# token matrix, so the matrix is repeated BRANCHES times.
train_inputs = [X_train] * BRANCHES
validation_inputs = [X_test] * BRANCHES

# NOTE: the held-out split doubles as validation data during training.
model.fit(
    train_inputs,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_inputs, y_test),
)

Train on 10381 samples, validate on 2596 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f87f54b6a20>

In [15]:
# Score the trained model on the held-out split. `evaluate_model` is a project
# helper imported from the `metrics` module (implementation not shown here);
# BRANCHES is presumably used to repeat X_test once per model input — TODO
# confirm. The recorded output reports r2, explained variance and kappa.
evaluate_model(model, X_test, y_test, BRANCHES)

r2 score: 0.9417960480384787
explained variance score : 0.9439420700073242
kappa: 0.9687792624798369


In [16]:
# Combine the embeddings learned by the individual branches. The helper comes
# from the project's `embeddings` module (implementation not shown here); per
# its log message it averages the embedding weight matrices, presumably those
# of the 'emb-<filters>' layers selected via FILTERS — TODO confirm.
embedding = average_weights_embeddings(FILTERS, model)

Усредняем веса 4 матриц весов эмбеддингов


In [17]:
# Persist the averaged embedding to 'embeddings.txt' together with the
# tokenizer's word -> index mapping. The file format is defined by the project
# helper in the `embeddings` module (not shown here); presumably one line per
# word with its vector — verify against the helper.
save_embeddings(embedding, word_index, 'embeddings.txt')

Готово! Загружено 41432 векторов слов размерностью 100.
