In [12]:
import pandas as pd 
import ezodf
from os import listdir
from os.path import isfile, join
import numpy as np
from datetime import datetime
from elasticsearch import Elasticsearch
import time
import ast
import json

import tqdm
from tqdm._tqdm_notebook import tqdm_notebook

from elasticsearch_dsl import MultiSearch, Search

In [13]:
%matplotlib inline

In [14]:
### clean indices from elastic_search object
def delete_all_indices(elastic_search):
    for index in list(elastic_search.indices.get_alias('*').keys()):
        elastic_search.indices.delete(index)
        elastic_search.indices.clear_cache()
    return elastic_search

In [15]:
PATH_TO_SQUAD_TRAIN = '../train-v1.1.json'

In [16]:
from pandas.io.json import json_normalize

with open(PATH_TO_SQUAD_TRAIN) as f:
    SberQuAD = json.load(f)['data']
    
sberquad_df = json_normalize(SberQuAD,
    record_path=['paragraphs', 'qas', 'answers'], 
    meta=[
        'title', 
        ['paragraphs', 'context'], 
        ['paragraphs', 'qas', 'id'], 
        #['paragraphs', 'qas', 'plausible_answers'], 
        ['paragraphs', 'qas', 'question']
    ],
    #errors='ignore'
    )

  del sys.path[0]


In [17]:
from elasticsearch_dsl import analyzer, token_filter
from collections import defaultdict


def create_index_settings(settings, special_properties, simple_properties, spec={'type': 'text', 
                                                                                 'analyzer': 'my_analyzer'}):
    _props = defaultdict(dict)
    for prop in special_properties:
        _props[prop] = spec
    for prop in simple_properties:
        _props[prop] = { 'type': 'text'}
        
    return {
        'settings': settings,
        'mappings': {
            '_doc': {
                'properties': dict(_props)
            }
        }
    }


def create_query_body(query_type, field_name, query):
    ### main case with fuzzy
    if query_type == 'fuzzy':
        return {
            'query': {
                'match': {
                    field_name: {
                       'query': query,
                       'fuzziness': 2,
                       'prefix_length': 3,
                       'max_expansions': 300,
                       'operator': 'or',
                    }
                }
            },
        }
    else:
        return {}
    
    
def search_param(query_type, field_name, query, index):
    return {
        'query_body': create_query_body(query_type, field_name, query),
        'index_name': index,
        'search_field': field_name,
    }


def stack_uneven(arrays, fill_value=0.):
    '''
    Fits arrays into a single numpy array, even if they are
    different sizes. `fill_value` is the default value.

    Args:
            arrays: list of np arrays of various sizes
                (must be same rank, but not necessarily same size)
            fill_value (float, optional):

    Returns:
            np.ndarray
    '''
    sizes = [a.shape for a in arrays]
    max_sizes = np.max(list(zip(*sizes)), -1)
    # The resultant array has stacked on the first dimension
    result = np.full((len(arrays),) + tuple(max_sizes), fill_value)
    for i, a in enumerate(arrays):
      # The shape of this array `a`, turned into slices
      slices = tuple(slice(0,s) for s in sizes[i])
      # Overwrite a block slice of `result` with this array `a`
      result[i][slices] = a
    return result


def correct_answer_ids(current_page_numbers, current_document_name, dataset):
    page_number_mask = dataset['page_numbers_list'].apply(lambda x: len(set(current_page_numbers).intersection(x)) > 0)
    document_name_mask = dataset['document_name'].apply(lambda x: x == current_document_name)
    exact_df = dataset[page_number_mask & document_name_mask]
    return set(exact_df.index)


def full_metrics_count(relevance_matrix):
    results = []
    raw_relevance_matrix = relevance_matrix.copy()
    print ('NDCG-5: ', np.array(list(map(lambda x: ndcg_at_k(x, 5), raw_relevance_matrix))).mean())
    results.append(np.array(list(map(lambda x: ndcg_at_k(x, 5), raw_relevance_matrix))).mean())
    print ('NDCG-10: ', np.array(list(map(lambda x: ndcg_at_k(x, 10), raw_relevance_matrix))).mean())
    results.append(np.array(list(map(lambda x: ndcg_at_k(x, 10), raw_relevance_matrix))).mean())
    print ('NDCG-20: ', np.array(list(map(lambda x: ndcg_at_k(x, 20), raw_relevance_matrix))).mean())
    results.append(np.array(list(map(lambda x: ndcg_at_k(x, 20), raw_relevance_matrix))).mean())
    print('\n\n')
    return results


def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)
    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg


def print_metric_results(full_scores_df, columns, N_list):
    _metrics = []
    for column in columns:
        current_positions = full_scores_df.sort_values(['question', column], ascending=False).\
            groupby('question')['is_correc_answer'].apply(np.array).apply(np.argmax).values
        print (column)
        for N in N_list:
            print ('TOP-{}: {}'.format(N, (current_positions < N).mean() * 100))
        
        
        current_df = full_scores_df.sort_values(['question', column], ascending=False)
        relevances_matrix = current_df.groupby('question')['is_correc_answer'].apply(np.array).values
        relevances_matrix = np.stack(relevances_matrix, axis=0) #if not use_stack_uneven else stack_uneven(relevances_matrix)
        _metrics.append(full_metrics_count(relevances_matrix))
    return _metrics
        

In [18]:
es = Elasticsearch(["http://admin:admin@localhost:9200"])

In [20]:
ANALYSIS_list = [
    {
            "filter": {
                "en_US": {
                    "type": "hunspell",
                    "language": "en-US"
                },
                "ru": {
                    "type": "hunspell",
                    "language": "ru"
                }
            },
            "analyzer": {
                "en_US": {
                    "tokenizer": "standard",
                    "filter": [ "lowercase", "en_US" ]
                },
                "ru": {
                    "tokenizer": "standard",
                    "filter": [ "lowercase", "ru" ]
                }
            }
        },
    
    {
            "analyzer" : {
                "eng_analyzer" : {
                    "tokenizer" : "standard",
                    "filter" : ["lowercase", "eng_stemmer"]
                },
                "ru_analyzer" : {
                    "tokenizer" : "standard",
                    "filter" : ["lowercase", "ru_stemmer"]
                }
            },
            "filter" : {
                "eng_stemmer" : {
                    "type" : "stemmer",
                    "name" : "english"
                },
                "ru_stemmer" : {
                    "type" : "stemmer",
                    "name" : "russian"
                }
            }
        },
]

In [21]:
fields=['context']

In [25]:
sberquad_df.shape[0]//500+1

90

# MAIN LOOP

In [None]:
number_of_groups = sberquad_df.shape[0]//500+1

ndcg_list = []

for _i in range(number_of_groups):

    _squad_df = sberquad_df.loc[_i*500:(_i+1)*500]

    squad_df_pairs = _squad_df[['paragraphs.qas.question', 'paragraphs.context']]
    squad_df_pairs['question'] = squad_df_pairs['paragraphs.qas.question']
    squad_df_docs = _squad_df['paragraphs.context']

    squad_df_pairs['doc_id'] = squad_df_pairs['paragraphs.context'].apply(lambda x: 
                                                        squad_df_docs[squad_df_docs == x].index.values[0])
    dataset = squad_df_docs
    try:
        # Delete indices
        delete_all_indices(es)
    except:
        pass

    new_indexes = defaultdict(str)

    for e, i in enumerate(ANALYSIS_list):
        _settings = {
            'index':{
                'blocks':{
                    'read_only_allow_delete' : 'false'
                }
            },
            'number_of_replicas':0,
            'number_of_shards': 1,
            "analysis": i
        }


        _ = str(e) + '_index'

        _props = {'type': 'text', 
                      'analyzer': list(i['analyzer'].keys())[1], 
                     }
        new_indexes[_.lower()] = create_index_settings(_settings, 
                                                 fields,
                                                 ['document_name'],
                                                 _props)

    # Create new indices
    for index_name, index_settings in tqdm_notebook(new_indexes.items()):
        es.indices.create(index_name, body=index_settings, include_type_name=True)
        for i in tqdm_notebook(dataset.index):
            es.index(index=index_name, doc_type='_doc',
                     body={'context': dataset[i]}, id=i)
        es.indices.refresh(index_name)
        

    # Create parameters for queries
    search_params_dict = defaultdict(str)
    for i in new_indexes.keys():
        for f in fields:
            search_params_dict[i + '_' + f] = search_param('fuzzy', f, 'question', i)

    # Create names and dataframe for scores
    score_columns = [col + '_score' for col in list(search_params_dict.keys())]
    global full_scores_df
    full_scores_df = pd.DataFrame(columns=(['question', 
                                            'doc_id'] + score_columns))


    questions_list = []
    document_ids_list = []
    document_names_list = []
    document_pages_list = []

    for question in tqdm_notebook(squad_df_pairs['question'].value_counts().index):
        for i in dataset.index:
            questions_list.append(question)
            document_ids_list.append(i)

    # fill dataframe values
    full_scores_df['question'] = questions_list
    full_scores_df['doc_id'] = document_ids_list
    for score_column in tqdm_notebook(score_columns):
        full_scores_df[score_column] = 0

    ### create index    
    full_scores_df['index'] = list(map(lambda x1, x2: 
                                       (x1, x2), full_scores_df['question'], full_scores_df['doc_id']))
    full_scores_df.set_index('index', inplace=True)

    set_questions = set(questions_list)

    # Eighth part
    ### iterating by question
    for i in tqdm_notebook(set_questions):
        current_question = i
        current_indexes = list(map(lambda x: (current_question, x), dataset.index))


        ### iterating by search parameters
        for search_name, search_params in search_params_dict.items():
            ### create query body
            current_query_body = search_params['query_body']
            current_query_body['query']['match'][search_params['search_field']]['query'] = current_question

            ### run search


            search_results = es.search(index=search_params['index_name'], body=current_query_body, 
                                       params={'size': len(dataset), 'search_type' : 'dfs_query_then_fetch'})
            search_results = search_results['hits']['hits']
            ### save results to dataset
            ### fill values in dataframe
            scores_list = list(map(lambda x: x['_score'], search_results))
            ids_list = list(map(lambda x: int(x['_id']), search_results))
            score_dict = dict(zip(ids_list, scores_list))
            full_scores_df.loc[current_indexes, search_name + '_score'] = list(map(lambda x: 
                                                                                   score_dict.get(x, 0), 
                                                                                   dataset.index))

    # Nineth part

    ### now, we need create labels for (q,d) pairs - some of them would be 1, for correct documents.

    ### we just checked numbers of pages - if intersetion of question pages and document pages is not null, 
    ### then this pair is correct
    squad_df_pairs['correct_answer_id'] = squad_df_pairs['doc_id']


    correct_answers_dict = squad_df_pairs.set_index('question')['correct_answer_id'].to_dict()
    full_scores_df['is_correc_answer'] = full_scores_df.apply(lambda x: 
                                                              1 if x.doc_id in 
                                                              [correct_answers_dict[x.question]] else 0, axis=1) 
    
    print(f'{_i*500}-{(_i+1)*500}\n\n')
    ndcg_list.append(print_metric_results(full_scores_df, score_columns, [1, 5, 10, 20]))



In [27]:
np.array(ndcg_list).shape

(91, 2, 3)

In [28]:
np.mean(ndcg_list, axis = 0) 

array([[0.97546959, 0.97674733, 0.97738175],
       [0.97548501, 0.97684416, 0.97746666]])