# Data validation
In this notebook we test our data and provide some basic analysis on the data we work with.

First, we verify that all documents which we use for query ranking are in the ElasticSearch index.

## 1. Count the number of documents

In [14]:
ES_INDEX_NAME = 'freeze5-test'

In [15]:
import json
import pprint
import logging
import os
import pandas as pd
from collections import defaultdict, Counter
from elasticsearch import Elasticsearch

import util

# the elasticsearch data and queries are split into languages.
LANGUAGES = ['en', 'nl']

INDIECES = {lang: f'{ES_INDEX_NAME}-{lang}' for lang in LANGUAGES}
QUERY_FILES = {lang: f'queries/queries-{lang}.json' for lang in LANGUAGES}
logging.basicConfig(format='[%(asctime)-15s][%(levelname)-7s] %(message)s',
                        level=logging.INFO)

ES_CREDENTIALS = 'es-credentials.json'
es_client = util.get_es_client(ES_CREDENTIALS)
DOC_TYPE = '_doc'
print(INDIECES)
print(QUERY_FILES)

[2019-05-02 14:18:48,581][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/_cat/health [status:200 request:0.044s]


{'en': 'freeze5-test-en', 'nl': 'freeze5-test-nl'}
{'en': 'queries/queries-en.json', 'nl': 'queries/queries-nl.json'}


Let us check the number of documents we have in the freeze. We need to compare this number with the logged by the pol-harvester when creating the index.

In [16]:
def get_indices(es_client):
    return es_client.cat.indices(format="json", v=True)
indieces = get_indices(es_client)
es_doc_count = 0
for index in indieces:
    if index['index'] in INDIECES.values():
        print(index)
        es_doc_count += int(index['docs.count'])
es_doc_count

[2019-05-02 14:18:53,594][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/_cat/indices?format=json&v=true [status:200 request:0.035s]


{'health': 'green', 'status': 'open', 'index': 'freeze5-test-nl', 'uuid': 'aHUbTssGQDGn5nXHSxYfLA', 'pri': '1', 'rep': '0', 'docs.count': '4129', 'docs.deleted': '8', 'store.size': '23.8mb', 'pri.store.size': '23.8mb'}
{'health': 'green', 'status': 'open', 'index': 'freeze5-test-en', 'uuid': 'S1rv8GGET0KSDfANbtEalw', 'pri': '1', 'rep': '0', 'docs.count': '1689', 'docs.deleted': '202', 'store.size': '10.2mb', 'pri.store.size': '10.2mb'}


5818

## 2. Verify that the documents which are queried are there

In [18]:
def read_queries(query_file):
    with open(query_file, 'rt') as file:
        return json.load(file)
QUERIES = {lang: read_queries(QUERY_FILES[lang]) for lang in LANGUAGES}

## Check if queries are split correctly

In [19]:
def are_queries_in_es(lang, queries, index):
    missing_docs = {}
    for query in queries:
        for doc in query['items']:
            try:
                response = es_client.get(index=index, doc_type=DOC_TYPE, id=doc['hash'])
            except Exception as e:
                missing_docs[doc['hash']] = {'query': query["queries"][0], 'lang': lang}
    return missing_docs

In [20]:
es_missing = [are_queries_in_es(lang, QUERIES[lang], INDIECES[lang]) for lang in LANGUAGES]
es_missing

[2019-05-02 14:20:40,760][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-en/_doc/18604a456db12d50da67af95f596ab795ea4eb1a [status:200 request:0.081s]
[2019-05-02 14:20:40,774][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-en/_doc/8c87b0c6111df4d2fe8633a8b274851b2ec4a2c0 [status:200 request:0.012s]
[2019-05-02 14:20:40,799][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-en/_doc/fe118ba3f8b3d49a0152f7e1f1bdfbb31490b808 [status:200 request:0.023s]
[2019-05-02 14:20:40,812][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-en/_doc/eb6c2ae9d0f29b7de72a6dc6acfa96a3dcff111f [status:200 request:0.012s]
[2019-05-02 14:20:40,833][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-en/_doc/a000df1ed8e28fc1f9458231ac9fe60c0debe396 [status:200 request:0.018s]
[2019-05-02 14:20:40,853][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-en/_doc/18604a456db12d50da67af95f596ab795ea4eb1a [status:200 request:

[2019-05-02 14:20:41,481][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-en/_doc/cf92f2726f45908b33d006de64d5f42153dcc1ca [status:200 request:0.009s]
[2019-05-02 14:20:41,488][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-en/_doc/f5a6dbdd62e9e329ca92066ea1a592defd5c7f04 [status:200 request:0.007s]
[2019-05-02 14:20:41,508][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-en/_doc/eef2df3658e0eea6da1bf3b778a05242601f8ac5 [status:200 request:0.020s]
[2019-05-02 14:20:41,528][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-en/_doc/401035affeb86d1dcba441593b2ff22d78e9910a [status:200 request:0.017s]
[2019-05-02 14:20:41,543][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-en/_doc/330d36593b97ec9864b9557f8a0a4b3c400d4a06 [status:200 request:0.014s]
[2019-05-02 14:20:41,564][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-en/_doc/a329d8328109c884b83a372e3c7ef6889d8c7a5c [status:200 request:

[2019-05-02 14:20:42,224][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-nl/_doc/0dd9e946f3aca97d3e7f09df68efc35a114674e7 [status:200 request:0.043s]
[2019-05-02 14:20:42,284][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-nl/_doc/eff32a56fad71d67afe3d4fbcaffa42e1a9135f6 [status:200 request:0.055s]
[2019-05-02 14:20:42,323][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-nl/_doc/4c89bc1b0f5c76b6b58fbd13e590c0921a9c42f8 [status:200 request:0.037s]
[2019-05-02 14:20:42,330][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-nl/_doc/abb17d5655caa8c1be59e9ed278ad306b0ba5269 [status:200 request:0.005s]
[2019-05-02 14:20:42,352][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-nl/_doc/a80adcbe6de881b98115862847a13a5b99b62077 [status:200 request:0.021s]
[2019-05-02 14:20:42,362][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-nl/_doc/600a70a38d380768a6fc8728389688d94e1efbc3 [status:200 request:

[2019-05-02 14:20:42,936][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-nl/_doc/e8064096f14ec6d252ba5bccdedcddd9ef1a6734 [status:200 request:0.008s]
[2019-05-02 14:20:42,945][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-nl/_doc/c72334bcb5f4c517ecacdb76b0654c2725c47ff9 [status:200 request:0.007s]
[2019-05-02 14:20:42,952][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-nl/_doc/6f6ae1d8cd6a2cc541620ded919f9f317466cdd5 [status:200 request:0.005s]
[2019-05-02 14:20:42,957][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-nl/_doc/d6c8ce2beb547aac34a63c3b1621aed50f7515d0 [status:200 request:0.005s]
[2019-05-02 14:20:42,964][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-nl/_doc/01ca8a7f469196922a1660c277f4b07d5fe9d794 [status:200 request:0.005s]
[2019-05-02 14:20:42,977][INFO   ] GET https://surfpol.sda.surf-hosted.nl:443/freeze5-test-nl/_doc/924612189b319da509fd65a750bbaf67a71185e3 [status:200 request:

[{},
 {'aa508762def8fb9f34ed7e0186eca1c4207e7298': {'query': 'leren leren',
   'lang': 'nl'},
  'd610580ea5cea9d1cc0889d9e52ef5add8c1fc31': {'query': 'noteren van muziek',
   'lang': 'nl'},
  '15350fb67a0634c6612e5a575fed67b75409e365': {'query': 'didactiek',
   'lang': 'nl'},
  'dc55a6bb1c525b3494c07d0557f557605ffec7c2': {'query': 'didactiek',
   'lang': 'nl'}}]

In [33]:
def explain(id, query):
    print(es_client.explain(index=INDEX, doc_type=DOC_TYPE, id=id, query=query))

In [36]:
def search(query, **kwargs):
    pprint.pprint(es_client.search(index=INDEX, doc_type=DOC_TYPE, body=query, **kwargs))

In [37]:
usersSearch = 'injecties geven'
Fields = ['title^2', 'text']

query = {
    'query': {
        'multi_match': { 
            'query': usersSearch, #A
            'fields': Fields, #B
        },
    },
  #  'size': '100'
}
search(query)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '34adeddc05423967fd2b70e1068581ba87c0d547',
                    '_index': 'freeze3-2-nl',
                    '_score': 15.291727,
                    '_source': {'arrangement_collection_name': 'wwmhbo',
                                'conformed_mime_type': 'html',
                                'keywords': [],
                                'mime_type': 'application/xhtml+xml',
                                'text': ' Water geven ',
                                'text_plain': ' Water geven ',
                                'title': 'Water geven',
                                'title_plain': 'Water geven',
                                'url': 'https://maken.wikiwijs.nl/81245/PBAN7_Bedrijfsgebouwen_tuinbouw/item6f8ffcb899cc385a.html'},
                    '_type': '_doc'},
                   {'_id': '41e6906f81c934c40083ea947f5e029007ab5600',
                    '_index': 'freeze3

                                        'dan de MantelScan (DOC15.4) af bij '
                                        'iemand die je in je eigen omgeving '
                                        'kent die mantelzorger is. Wanneer je '
                                        'dit hebt gedaan heb je veel '
                                        'informatie over de zorgsituatie '
                                        'gekregen. Ook de knelpunten zijn '
                                        'wellicht naar boven gekomen. Ook heb '
                                        'je door de MantelScan inzicht '
                                        'gekregen in de ervaren belasting van '
                                        'de mantelzorger. De MantelScan bevat '
                                        'namelijk 2 vragenlijsten die ook in '
                                        'Tabel 15.3 zijn behandeld (de SRBS en '
                                        'de EDIZ). Bedenk welk gerich

In [None]:
raw

In [72]:
explain(id='3605374af3eedfa7084c17cbea6d0fce3862bfb0')

GET https://surfpol.sda.surf-hosted.nl:443/freeze3/doc/3605374af3eedfa7084c17cbea6d0fce3862bfb0/_explain [status:400 request:0.047s]


RequestError: RequestError(400, 'action_request_validation_exception', 'Validation Failed: 1: query is missing;')