# ELMO

# (1) Скачиваем ELMO

In [1]:
# Download the zip archive with the ELMo model files from the NLPL vector repository.
!wget "http://vectors.nlpl.eu/repository/11/196.zip"

--2019-10-23 20:42:06--  http://vectors.nlpl.eu/repository/11/196.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.225
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.225|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 206986345 (197M) [application/zip]
Saving to: ‘196.zip’


2019-10-23 20:42:14 (61.1 MB/s) - ‘196.zip’ saved [206986345/206986345]



In [2]:
# Unpack the downloaded archive into the ELMO/ directory.
!unzip '196.zip' -d 'ELMO'

Archive:  196.zip
  inflating: ELMO/meta.json          
  inflating: ELMO/model.hdf5         
  inflating: ELMO/options.json       
  inflating: ELMO/README             
  inflating: ELMO/vocab.txt          


In [0]:
import os

# Create the working directory. exist_ok=True keeps this cell re-runnable:
# a plain os.mkdir raises FileExistsError when the directory already exists.
os.makedirs('bilm', exist_ok=True)

In [4]:
! pip install bilm

Collecting bilm
  Downloading https://files.pythonhosted.org/packages/22/a6/711e6ea5a05f7ce72f0a5c6c3bfbd1451aeb8810c9ec8074d5667e3ff433/bilm-0.1.post5-py3-none-any.whl
Installing collected packages: bilm
Successfully installed bilm-0.1.post5


In [0]:
# Load IPython's autoreload extension.
# NOTE(review): no `%autoreload` mode is set after loading — confirm whether
# `%autoreload 2` was intended.
%load_ext autoreload

import time
import numpy as np
import tensorflow as tf
# NOTE(review): elmo_helpers is not installed by any cell shown here;
# presumably the module has to be placed next to the notebook — TODO confirm.
from elmo_helpers import tokenize, get_elmo_vectors, load_elmo_embeddings

# Clear the default TensorFlow graph before the model is loaded.
tf.reset_default_graph()
# Directory the model archive was unpacked into.
elmo_path = 'ELMO'

In [6]:
# Load the ELMo model from the unpacked directory. The three returned handles
# are passed to get_elmo_vectors() by the indexing and query cells.
batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(elmo_path)





Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where






# (2) Скачиваем корпус


In [7]:
# Download the corpus CSV (quora_question_pairs_rus.csv) from Dropbox.
!wget 'https://www.dropbox.com/s/jaa5y82qzul6byn/quora_question_pairs_rus.csv'

--2019-10-23 20:45:46--  https://www.dropbox.com/s/jaa5y82qzul6byn/quora_question_pairs_rus.csv
Resolving www.dropbox.com (www.dropbox.com)... 162.125.65.1, 2620:100:6021:1::a27d:4101
Connecting to www.dropbox.com (www.dropbox.com)|162.125.65.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/jaa5y82qzul6byn/quora_question_pairs_rus.csv [following]
--2019-10-23 20:45:46--  https://www.dropbox.com/s/raw/jaa5y82qzul6byn/quora_question_pairs_rus.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc992ace9b14419fdf1366c169e2.dl.dropboxusercontent.com/cd/0/inline/Aq_2oP13JyZE_Q9dxN7Umlo1FxUTWq1-thO45_5cMf9sWNlzJXqby-uuQ_y6MBMB26teYi_-h1SldrlrSpimkvQz26SBa3ldoHPCflvGtbHjzjv0JWyCYpAdWX0sWo_VwMc/file# [following]
--2019-10-23 20:45:46--  https://uc992ace9b14419fdf1366c169e2.dl.dropboxusercontent.com/cd/0/inline/Aq_2oP13JyZE_Q9dxN7Umlo1FxUTWq1-thO45_5cMf9sWNlzJXqby-uuQ_y6MBM

In [0]:
# Path to the downloaded corpus file (relative to the working directory).
corpus = 'quora_question_pairs_rus.csv'

# Индексируем корпус


In [0]:
import csv

def get_data_elmo(corpus, stop=5000):
    """
    Read the corpus CSV and tokenize the document texts.

    Every data row is expected to hold exactly four columns:
    ``id, text, query, is_duplicate``. Blank rows and rows whose first
    column is empty (presumably the header row) are skipped.

    :param corpus: path to csv file with corpus
    :param stop: int or None, how many data rows we want to get;
        ``None`` reads the whole file
    :return:
        indexed -> list with one ``tokenize(text)`` result per data row
        id_to_text -> dict, map of text id (str, as read from the file) to raw text
        query_to_dupl_id -> dict, query -> id of the row where it is marked
            as a duplicate (``is_duplicate == '1'``)
    :raises ValueError: if a non-skipped row does not have exactly four columns
    """
    indexed = []
    id_to_text = {}
    query_to_dupl_id = {}

    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(corpus, 'r', encoding='utf-8', newline='') as f:
        for line in csv.reader(f):
            # csv.reader yields [] for a blank line; indexing it would raise
            # IndexError, so check for emptiness first.
            if not line or line[0] == '':
                continue

            _id, text, query, isduplicate = line
            id_to_text[_id] = text

            if isduplicate == '1':
                query_to_dupl_id[query] = _id

            indexed.append(tokenize(text))

            # len(indexed) is the number of data rows consumed so far.
            if stop is not None and len(indexed) >= stop:
                break
    return indexed, id_to_text, query_to_dupl_id

In [0]:
# Read and tokenize up to 50,000 corpus rows; `cleaned` holds the tokenized texts.
cleaned, id_to_text, query_to_dupl_id = get_data_elmo(corpus, stop=50000)

In [0]:
def crop_vec(vect, sent):
    """
    Cut off the dummy rows of an ELMo output and average what is left.

    Only the first ``len(sent)`` rows of ``vect`` are kept; the result is
    their mean along axis 0. For an empty ``sent`` the mean is taken over
    zero rows, which yields NaN values.

    :param vect: np.array, 2-D vector from ELMo for one sentence
    :param sent: list of str, tokenized sentence
    :return: np.array, mean over the kept rows
    """
    n_tokens = len(sent)
    return vect[:n_tokens, :].mean(axis=0)

In [0]:
def indexing(cleaned, batcher, sentence_character_ids, elmo_sentence_input,
             batch_size=200):
    """
    Indexing corpus: compute one averaged ELMo vector per document.

    Documents are sent to the model in consecutive batches. The last batch
    may be shorter than ``batch_size``, so every document in ``cleaned``
    gets a vector and position ``i`` of the result corresponds to
    ``cleaned[i]``.

    :param cleaned: list of lists of str, tokenized documents from the corpus
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model
    :param batch_size: int, number of documents per model call (default 200)

    :return: list of document vectors (np.array), in corpus order
    :raises ValueError: if ``batch_size`` is smaller than 1
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    indexed = []
    with tf.Session() as sess:
        # It is necessary to initialize variables once before running inference.
        sess.run(tf.global_variables_initializer())

        # Start from 0 and step by batch_size so that a trailing partial
        # batch is processed instead of being silently dropped.
        for start in range(0, len(cleaned), batch_size):
            sentences = cleaned[start:start + batch_size]
            elmo_vectors = get_elmo_vectors(
                sess, sentences, batcher, sentence_character_ids, elmo_sentence_input)

            for vect, sent in zip(elmo_vectors, sentences):
                indexed.append(crop_vec(vect, sent))
    return indexed

In [13]:
# NOTE: this rebinds the global name `time` from the module imported in the
# setup cell to the function time.time; the evaluation cell at the end of the
# notebook calls `time()` directly and relies on this.
from time import time
start = time()
# Compute the document vectors for the whole tokenized corpus.
indexed = indexing(cleaned, batcher, sentence_character_ids, elmo_sentence_input)
print('Затрачено секунд: ', time() - start)

Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 200
Sentences in this batch: 

Затрачено секунд:  3644.9178235530853


In [0]:
import pickle

# Persist the document vectors together with both lookup dicts as one tuple.
with open('Indexed_ELMO.pickle', 'wb') as f:
    pickle.dump((indexed, id_to_text, query_to_dupl_id), f)

In [0]:
def cos_sim(v1, v2):
    """
    Counts cosine similarity between two vectors.

    :param v1: 1-D np.array
    :param v2: 1-D np.array of the same length as ``v1``
    :return: inner product of the vectors divided by the product of their
        Euclidean norms; 0.0 if either vector has zero norm
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    # A zero vector has no direction: return 0.0 instead of computing 0/0,
    # which would give NaN and a RuntimeWarning.
    if norm_product == 0:
        return 0.0
    return np.inner(v1, v2) / norm_product

In [0]:
import pickle

# Restore the document vectors and both lookup dicts from the pickle file.
# NOTE: pickle.load can execute arbitrary code stored in the file — only load
# a file you produced yourself.
with open('Indexed_ELMO.pickle', 'rb') as f:
    indexed, id_to_text, query_to_dupl_id = pickle.load(f)

In [0]:
def prepare_query(query, batcher, sentence_character_ids, elmo_sentence_input):
    """
    Turn a raw query string into a single averaged ELMo vector.

    The query is tokenized, sent through the model as a batch of one
    sentence inside a fresh TensorFlow session, and the output is reduced
    with ``crop_vec``.

    :param query: str
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model

    :return: vector of query
    """
    tokens = tokenize(query)
    with tf.Session() as session:
        # Variables have to be initialized in every new session before inference.
        session.run(tf.global_variables_initializer())
        batch_vectors = get_elmo_vectors(
            session, [tokens], batcher, sentence_character_ids, elmo_sentence_input)
        query_vector = crop_vec(batch_vectors[0], tokens)
    return query_vector

In [0]:
def search_tool_elmo(query, batcher, sentence_character_ids,
                     elmo_sentence_input, indexed):
    """
    Search query in corpus

    Scores every document vector by cosine similarity with the query vector.
    Documents whose score is not a finite number (NaN or inf, e.g. when a
    document vector contains NaN) are left out of the ranking.

    :param query: str
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model
    :param indexed: sequence of document vectors (np.array), indexed corpus

    :return: list of ``(position in indexed, score)`` tuples, sorted by
        score in descending order
    """
    q = prepare_query(query, batcher, sentence_character_ids,
                      elmo_sentence_input)

    result = {}
    for i, doc_vector in enumerate(indexed):
        score = cos_sim(q, doc_vector)
        # A NaN score is still a numpy float, so a type check cannot catch it;
        # NaN compares False with everything and would make the sort order
        # unreliable. Keep only finite scores.
        if np.isfinite(score):
            result[i] = score

    return sorted(result.items(), key=lambda x: x[1], reverse=True)
    

# Качество поиска

In [0]:
def get_score_elmo(indexed, query_to_dupl_id, batcher, sentence_character_ids,
                   elmo_sentence_input, test=100):
    """
    Counts the quality of the search (from 0 to 1.0)

    For every test query the five best search results are taken; the query
    counts as a hit when the id of its duplicate is among them. The score is
    the share of hits.

    The duplicate id is compared with positions in ``indexed``, so this
    relies on document ids being equal to document positions.

    :param indexed: sequence of document vectors, indexed corpus
    :param query_to_dupl_id: dict, query -> id of its duplicate
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model
    :param test: int, how many queries to evaluate; 0 means all of them

    :return: float, share of hits; 0.0 when there are no queries to evaluate
    """
    test_query = list(query_to_dupl_id.keys())

    if test != 0:
        test_query = test_query[:test]

    # Nothing to evaluate: avoid dividing by zero below.
    if not test_query:
        return 0.0

    hits = 0
    for q in test_query:
        dupl_id = int(query_to_dupl_id[q])

        results = search_tool_elmo(q, batcher, sentence_character_ids,
                                   elmo_sentence_input, indexed)[:5]
        text_ids = [result[0] for result in results]

        if dupl_id in text_ids:
            hits += 1

    return hits / len(test_query)

In [0]:
# Evaluate search quality with the default number of test queries and report
# the elapsed wall-clock time. `time` here is the function bound by the
# earlier `from time import time`.
start = time()
print(get_score_elmo(indexed,query_to_dupl_id, batcher, sentence_character_ids, 
                   elmo_sentence_input))
print('Затрачено времени', time() - start)