In [12]:
import json
import pprint
import os
import pandas as pd
from collections import defaultdict
from elasticsearch import Elasticsearch

import util

ES_CREDENTIALS = 'es-credentials.json'
RAW_DATA_FOLDER = 'data/freeze_04/'
ES_FOLDER = 'data/elasticsearch_freeze4' #see "pol-harvester/elasticsearch/readme.md", read_freeze.py
INDEX = 'freeze4'
LANGUAGES = ['en', 'nl']
INDIECES = {lang: f'{INDEX}-{lang}' for lang in LANGUAGES}
QUERY_FILES = {lang: f'queries/queries-{lang}.json' for lang in LANGUAGES}
ES_DATA_FOLDERS = {lang: f'{ES_FOLDER}/{lang}' for lang in LANGUAGES}

## Check if all the queries are in the freeze
First we check if all the documents in the queries are there

In [13]:
def read_queries(query_file):
    with open(query_file, 'rt') as file:
        return json.load(file)
QUERIES = {lang: read_queries(QUERY_FILES[lang]) for lang in LANGUAGES}

In [14]:
def are_queries_in_raw(lang, queries):
    missing_docs = {}
    for query in queries:
        for doc in query['items']:
            if util.get_raw_document(doc['hash'], RAW_DATA_FOLDER) is None:
                missing_docs[doc['hash']] = {'query': query["queries"][0], 'lang': lang}
    return missing_docs

In [15]:
raw_missing = [are_queries_in_raw(lang, QUERIES[lang]) for lang in LANGUAGES]
raw_missing

[{},
 {'aa508762def8fb9f34ed7e0186eca1c4207e7298': {'query': 'leren leren',
   'lang': 'nl'}}]

### Some useful functions

In [16]:
def get_identifier_raw(id, identifier):
    """
    Finds a document with 'id' and retrieves the property 'identifier' or None.
    """
    if util.get_raw_document(id, RAW_DATA_FOLDER) is not None:
        return util.get_raw_document(id, RAW_DATA_FOLDER)[identifier]

## Check if queries are split correctly

To begin with we read in the queries and see:
1. The language of the raw document.
2. The text of the raw document.

In [17]:
query_split_view = [
    {
        'id': item['hash'],
        'raw_lang': get_identifier_raw(item['hash'], 'language')['from_text'] if get_identifier_raw(item['hash'], 'language') is not None else '',
        'query_lang': lang,
        'title': get_identifier_raw(item['hash'], 'title'),
        'text_snippet': get_identifier_raw(item['hash'], 'text')[:100] if get_identifier_raw(item['hash'], 'text') is not None else ''
    }
    for lang in LANGUAGES 
    for query in QUERIES[lang] 
    for item in query['items']
]

query_view = pd.DataFrame(query_split_view)

In [18]:
incorrect_lang = query_view[query_view['raw_lang'] != query_view['query_lang']]
incorrect_lang

Unnamed: 0,id,query_lang,raw_lang,text_snippet,title
64,aa508762def8fb9f34ed7e0186eca1c4207e7298,nl,,,
93,d610580ea5cea9d1cc0889d9e52ef5add8c1fc31,nl,en,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n,Overzicht bereik en transposities veel voorkom...
130,15350fb67a0634c6612e5a575fed67b75409e365,nl,en,\n\n,Didactiek van wiskundig denken
131,dc55a6bb1c525b3494c07d0557f557605ffec7c2,nl,en,\n\n,Didactiek van wiskundig denken


In [8]:
util.get_raw_document('9635ef227d46b21fd762208d33b217b37d5dc048', RAW_DATA_FOLDER)

{'id': '9635ef227d46b21fd762208d33b217b37d5dc048',
 'title': '4.5.2 De docent toont aan dat hij het leren leren van zijn leerlingen ondersteunt en bevordert met behulp van ICT.',
 'language': 'nl',
 'url': 'https://maken.wikiwijs.nl/95860/ICT_didactiek_portfolio/item86ead13d6a7fb8cd.html',
 'text': '\n4.5.2 De docent toont aan dat hij het leren leren van zijn leerlingen ondersteunt en bevordert met behulp van ICT.\n\nVanaf volgend jaar werken we op school met het programma Peppels. Dit is een digitaal portfolio.\n\nLeerlingen loggen in, maken hun eigen persoonlijke pagina aan en\xa0kunnen persoonlijke doelen opstellen en uitwerken\xa0Als mentor kan ik opdrachten voor hen klaar zetten, gericht op bijvoorbeeld studievaardigheden. Ik kan daarbij aangeven hoe ik wil dat de leerlingen deze opdracht inleveren en of er een beoordeling aan hangt.\n\nIk kan de leerlingen volgen en zij kunnen elkaar volgen.\n\nAfgelopen jaar hebben we een pilot gedraaid. Leerlingen pakken dit programma snel op, 

## Check if queries are in correct ES folder
Given that the queries have been split correctly to languages, we check the corresponding elasticsearch documents.

In [19]:
def are_queries_in_es(lang, queries, folder):
    ids = set()
    documents = util.read_documents(folder)
    for doc in documents:
        ids.add(doc['id'])
    missing_docs = {}
    for query in queries:
        for doc in query['items']:
            if doc['hash'] not in ids:
                missing_docs[doc['hash']] = {'query': query["queries"][0], 'lang': lang}
    return missing_docs

In [20]:
es_missing = [are_queries_in_es(lang, QUERIES[lang], ES_DATA_FOLDERS[lang]) for lang in LANGUAGES]
es_missing

[{},
 {'aa508762def8fb9f34ed7e0186eca1c4207e7298': {'query': 'leren leren',
   'lang': 'nl'},
  'd610580ea5cea9d1cc0889d9e52ef5add8c1fc31': {'query': 'noteren van muziek',
   'lang': 'nl'},
  '15350fb67a0634c6612e5a575fed67b75409e365': {'query': 'didactiek',
   'lang': 'nl'},
  'dc55a6bb1c525b3494c07d0557f557605ffec7c2': {'query': 'didactiek',
   'lang': 'nl'}}]

These should be the same documents as above.

In [31]:
es_client = util.get_es_client(ES_CREDENTIALS)
# here it is nice to set the language to a specific language
LANG = 'nl'
INDEX = INDIECES[LANG]
print(INDEX)
DOC_TYPE = '_doc'

freeze3-2-nl


In [32]:
def get_indices(es_client):
    print(es_client.cat.indices(format="b", v=True))
# get_indices(es_client)

In [33]:
def explain(id, query):
    print(es_client.explain(index=INDEX, doc_type=DOC_TYPE, id=id, query=query))

In [36]:
def search(query, **kwargs):
    pprint.pprint(es_client.search(index=INDEX, doc_type=DOC_TYPE, body=query, **kwargs))

In [37]:
usersSearch = 'injecties geven'
Fields = ['title^2', 'text']

query = {
    'query': {
        'multi_match': { 
            'query': usersSearch, #A
            'fields': Fields, #B
        },
    },
  #  'size': '100'
}
search(query)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '34adeddc05423967fd2b70e1068581ba87c0d547',
                    '_index': 'freeze3-2-nl',
                    '_score': 15.291727,
                    '_source': {'arrangement_collection_name': 'wwmhbo',
                                'conformed_mime_type': 'html',
                                'keywords': [],
                                'mime_type': 'application/xhtml+xml',
                                'text': ' Water geven ',
                                'text_plain': ' Water geven ',
                                'title': 'Water geven',
                                'title_plain': 'Water geven',
                                'url': 'https://maken.wikiwijs.nl/81245/PBAN7_Bedrijfsgebouwen_tuinbouw/item6f8ffcb899cc385a.html'},
                    '_type': '_doc'},
                   {'_id': '41e6906f81c934c40083ea947f5e029007ab5600',
                    '_index': 'freeze3

                                        'dan de MantelScan (DOC15.4) af bij '
                                        'iemand die je in je eigen omgeving '
                                        'kent die mantelzorger is. Wanneer je '
                                        'dit hebt gedaan heb je veel '
                                        'informatie over de zorgsituatie '
                                        'gekregen. Ook de knelpunten zijn '
                                        'wellicht naar boven gekomen. Ook heb '
                                        'je door de MantelScan inzicht '
                                        'gekregen in de ervaren belasting van '
                                        'de mantelzorger. De MantelScan bevat '
                                        'namelijk 2 vragenlijsten die ook in '
                                        'Tabel 15.3 zijn behandeld (de SRBS en '
                                        'de EDIZ). Bedenk welk gerich

In [72]:
explain(id='3605374af3eedfa7084c17cbea6d0fce3862bfb0')

GET https://surfpol.sda.surf-hosted.nl:443/freeze3/doc/3605374af3eedfa7084c17cbea6d0fce3862bfb0/_explain [status:400 request:0.047s]


RequestError: RequestError(400, 'action_request_validation_exception', 'Validation Failed: 1: query is missing;')