In [1]:
import numpy as np
import tensorflow

  from ._conv import register_converters as _register_converters


In [2]:
from tensorflow import keras

In [4]:
class Vocab(object):
    def __init__(self):
        self.rev_vocabulary_mapping = []  ## (V,E)
        self.vocabulary_mapping = {}  ## каждому слову индекс строки из матрицы E
        self.embeddings = None
    
    def unvectorize(self, ids):
        return ' '.join([self.rev_vocabulary_mapping[i] for i in ids])
    
    def vectorize(self, tokens):
        return [self.vocabulary_mapping.get(w, self.vocabulary_mapping['<UNK>']) for w in tokens]
    
    def load_embeddings(self, filename, embedding_size=50, cut_at=400000):
        # список векторов
        embeddings_list = []
        # словарь слово-индекс
        self.vocabulary_mapping = {'<PAD>': 0} # занесём ключ соотетствующий отступу
        self.rev_vocabulary_mapping.append('<PAD>')
        pad = np.zeros(embedding_size) # создадим вектор для PAD
        embeddings_list.append(pad)

        with open(filename,encoding='utf8') as glove_file: 
            i = 1
            for line in glove_file:
                fields = line.split()
                token = fields[0]
                vector = np.array(list(map(float, fields[1:])))
                
                embeddings_list.append(vector)
                self.vocabulary_mapping[token] = i
                self.rev_vocabulary_mapping.append(token)
                i += 1
                if i >= cut_at:
                    break
                    
        self.vocabulary_mapping['<UNK>'] = len(embeddings_list)
        self.rev_vocabulary_mapping.append('<UNK>')
        unk = np.mean(embeddings_list[1:], axis=0) # считаем средний вектор
        embeddings_list.append(unk)

        self.embeddings = np.array(embeddings_list)
                
        print(f'Loaded {self.embeddings.shape} embeddings')

In [5]:
vocab = Vocab()
vocab.load_embeddings('glove.6B.50d.txt')

Loaded (400001, 50) embeddings


In [6]:
v = vocab.vectorize(['mother','of','god'])
v

[809, 4, 1534]

In [7]:
vocab.unvectorize(v)

'mother of god'

### Загрузим ATIS

In [8]:
def load_csv(filename):
    results = []
    with open(filename) as f:
        for l in f:
            results.append(l.strip())
            
    return results

In [9]:
slots = load_csv('data/atis.dict.slots.csv')
intents = load_csv('data/atis.dict.intent.csv')
atis_vocab = load_csv('data/atis.dict.vocab.csv')

In [12]:
atis_train_queries = load_csv('data/atis.train.query.csv')
atis_train_intents = load_csv('data/atis.train.intent.csv')
atis_train_slots = load_csv('data/atis.train.slots.csv')

In [13]:
def atis_ids_to_words(ids, atis_vocab):
    return [atis_vocab[i] for i in ids]


In [14]:
atis_train_queries = [[int(index) for index in line.split()] for line in atis_train_queries]
atis_train_intents = [int(index) for index in atis_train_intents]
atis_train_slots = [[int(index) for index in line.split()] for line in atis_train_slots]

In [67]:
atis_train_queries_w = [[atis_vocab[i] for i in line] for line in atis_train_queries]
[' '.join([i for i in line]) for line in atis_train_queries_w]


['BOS i want to fly from boston at 838 am and arrive in denver at 1110 in the morning EOS',
 'BOS what flights are available from pittsburgh to baltimore on thursday morning EOS',
 'BOS what is the arrival time in san francisco for the 755 am flight leaving washington EOS',
 'BOS cheapest airfare from tacoma to orlando EOS',
 'BOS round trip fares from pittsburgh to philadelphia under 1000 dollars EOS',
 'BOS i need a flight tomorrow from columbus to minneapolis EOS',
 'BOS what kind of aircraft is used on a flight from cleveland to dallas EOS',
 'BOS show me the flights from pittsburgh to los angeles on thursday EOS',
 'BOS all flights from boston to washington EOS',
 'BOS what kind of ground transportation is available in denver EOS',
 'BOS show me the flights from dallas to san francisco EOS',
 'BOS show me the flights from san diego to newark by way of houston EOS',
 "BOS what 's the airport at orlando EOS",
 'BOS what is the cheapest flight from boston to bwi EOS',
 'BOS all fligh

In [17]:
train_sent = [vocab.vectorize(tokens[1:-1]) for tokens in atis_train_queries_w]

In [65]:
atis_train_queries_w

[['BOS',
  'i',
  'want',
  'to',
  'fly',
  'from',
  'boston',
  'at',
  '838',
  'am',
  'and',
  'arrive',
  'in',
  'denver',
  'at',
  '1110',
  'in',
  'the',
  'morning',
  'EOS'],
 ['BOS',
  'what',
  'flights',
  'are',
  'available',
  'from',
  'pittsburgh',
  'to',
  'baltimore',
  'on',
  'thursday',
  'morning',
  'EOS'],
 ['BOS',
  'what',
  'is',
  'the',
  'arrival',
  'time',
  'in',
  'san',
  'francisco',
  'for',
  'the',
  '755',
  'am',
  'flight',
  'leaving',
  'washington',
  'EOS'],
 ['BOS', 'cheapest', 'airfare', 'from', 'tacoma', 'to', 'orlando', 'EOS'],
 ['BOS',
  'round',
  'trip',
  'fares',
  'from',
  'pittsburgh',
  'to',
  'philadelphia',
  'under',
  '1000',
  'dollars',
  'EOS'],
 ['BOS',
  'i',
  'need',
  'a',
  'flight',
  'tomorrow',
  'from',
  'columbus',
  'to',
  'minneapolis',
  'EOS'],
 ['BOS',
  'what',
  'kind',
  'of',
  'aircraft',
  'is',
  'used',
  'on',
  'a',
  'flight',
  'from',
  'cleveland',
  'to',
  'dallas',
  'EOS'],
 ['

In [18]:
def build_classification_model(embeddings):
    # Точка входа в граф задаётся при помощи специальных тензоров типа Input
    # Первая координата соответсвует длине текста, так как тексты в датасете имеют разную длину
    # значение считается переменным
    text_input = keras.layers.Input(shape=(None,), dtype='int32') ## batch, num sentence = None, last = 1 - индексы из glove
    
    # Создаём специальный слой для работы с embedding, 
    # Его функция -- заменять индентификатор вектором из Glove
    # Указываем trainable = False, чтобы векторы embedding'ов не изменялись в процессе обучения
    embedding_layer = keras.layers.Embedding(input_dim = embeddings.shape[0], 
                                       output_dim = embeddings.shape[1], 
                                       weights=[embeddings],
                                       trainable = False) ## хорошая практика учить с замороженными embedding, а потом их разморозить
    
    x = embedding_layer(text_input)
    
    # Создаём рекуррентную ячейку
    # Первый параметр отвечает за размер внутреннего состояния (памяти ячейки)
    # По умолчанию такой слой возвращает только последнее состояние (см. картинку),
    # Если мы хотим получить состояния на каждом шаге необходимо указать return_sequences = True
    x = keras.layers.LSTM(128, recurrent_dropout=0.25)(x)
    
    # Полученный результат направляем в полносвязный слой, который будет осуществлять классификацию
    output = keras.layers.Dense(26, activation='softmax')(x)
    
    model = keras.models.Model(inputs=[text_input], outputs=[output], name = 'LSTM_classifier')
    
    return model

In [19]:
keras.backend.clear_session()
model = build_classification_model(vocab.embeddings)

In [20]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 50)          20000050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_1 (Dense)              (None, 26)                3354      
Total params: 20,095,052
Trainable params: 95,002
Non-trainable params: 20,000,050
_________________________________________________________________


In [21]:
adam = keras.optimizers.Adam(lr=0.0001)

model.compile(adam, keras.losses.sparse_categorical_crossentropy, metrics = ['acc'])

In [22]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(atis_train_queries, atis_train_intents, test_size = 0.1)

In [23]:
def generate_batches(x,y,batch_size = 64):
    i=0
    while True:
        i = i %len(x)
        yield keras.preprocessing.sequence.pad_sequences(x[i:i+batch_size]),y[i:i+batch_size]
        i += batch_size

In [37]:
train_generator = generate_batches(x_train, y_train)
val_generator = generate_batches(x_val,y_val)

train_steps  =len(x_train)/64
val_steps  =len(x_val)/64

In [39]:
model.fit_generator(generator=train_generator,
                    validation_data=val_generator,
                    steps_per_epoch = train_steps,
                    validation_steps  = val_steps,
                    workers=6)

Epoch 1/1


<tensorflow.python.keras._impl.keras.callbacks.History at 0x1c8f66aba58>

In [46]:
def model_predict(val_generator):
    y_pred = model.predict_generator(val_generator,steps=val_steps)
    return y_pred

y_pred = model_predict(val_generator)
preds = [np.argmax(i) for i in y_pred]

[14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
