In [None]:
import json
import pprint
import os
from collections import defaultdict
from elasticsearch import Elasticsearch

import util

# Credentials file handed to util.get_es_client when the client is created.
ES_CREDENTIALS = 'es-credentials.json'
# Folder passed to util.get_raw_document when looking up raw documents.
RAW_DATA_FOLDER = 'data/freeze3/output'
# Root of the per-language Elasticsearch document folders;
# see "pol-harvester/elasticsearch/readme.md" and read_freeze.py.
ES_FOLDER = 'data/freeze3/elasticsearch'
# Base name from which the per-language index names are derived.
INDEX = 'freeze3'
LANGUAGES = ['en', 'nl']
# Per-language index names, e.g. 'freeze3-en'.
# NOTE(review): the name is a misspelling of "INDICES"; it is left unchanged
# here because other cells or modules may refer to it.
INDIECES = {lang: f'{INDEX}-{lang}' for lang in LANGUAGES}
# Per-language path of the JSON file with the queries and their documents.
QUERIES = {lang: f'queries/queries-{lang}.json' for lang in LANGUAGES}
# Per-language sub-folder of ES_FOLDER.
ES_DATA_FOLDERS = {lang: f'{ES_FOLDER}/{lang}' for lang in LANGUAGES}

In [None]:
# Create the Elasticsearch client from the credentials file via util.get_es_client.
es_client = util.get_es_client(ES_CREDENTIALS)

In [None]:
def get_indices(es_client):
    """Print the index listing returned by the client's cat-indices call.

    Args:
        es_client: Object exposing ``cat.indices(...)``; the notebook passes
            the client created by ``util.get_es_client``.

    Returns:
        None. The listing is written to stdout.
    """
    # v=True is forwarded unchanged together with format="b".
    # NOTE(review): "b" looks like a value for the `bytes` option rather than
    # `format` — confirm against the cat API before changing it.
    listing = es_client.cat.indices(format="b", v=True)
    print(listing)

In [None]:
# Print the index listing for the connected client.
get_indices(es_client)

First, we check whether every document referenced in the query files is present, both in the raw data and in the Elasticsearch data folders.

In [None]:
def are_queries_in_raw(lang, query_file):
    """Collect the query documents that cannot be found in the raw data folder.

    Args:
        lang: Label stored with every missing document (the notebook passes a
            language code from LANGUAGES).
        query_file: Path to a JSON file containing a list of query objects,
            each with a "queries" list and an "items" list whose entries
            carry a "hash".

    Returns:
        dict mapping the hash of every document for which
        util.get_raw_document returns None to
        ``{'query': <first entry of "queries">, 'lang': lang}``. If a hash is
        missing for several queries, the last query wins.
    """
    # JSON is UTF-8 by specification; decode explicitly instead of relying on
    # the platform's default encoding, which is not UTF-8 everywhere.
    with open(query_file, 'rt', encoding='utf-8') as file:
        queries = json.load(file)

    missing_docs = {}
    for query in queries:
        for item in query['items']:
            if util.get_raw_document(item['hash'], RAW_DATA_FOLDER) is None:
                # The first query string is only read for missing documents.
                missing_docs[item['hash']] = {'query': query["queries"][0], 'lang': lang}
    return missing_docs

In [None]:
# One dict of missing raw documents per language, in LANGUAGES order.
raw_missing = [are_queries_in_raw(lang, QUERIES[lang]) for lang in LANGUAGES]
raw_missing

In [None]:
def are_queries_in_es(lang, query_file, folder):
    """Collect the query documents that cannot be found in an Elasticsearch data folder.

    Args:
        lang: Label stored with every missing document (the notebook passes a
            language code, or a tag such as 'nl_in_en').
        query_file: Path to a JSON file containing a list of query objects,
            each with a "queries" list and an "items" list whose entries
            carry a "hash".
        folder: Folder handed to util.get_es_document for the lookup.

    Returns:
        dict mapping the hash of every document for which
        util.get_es_document returns None to
        ``{'query': <first entry of "queries">, 'lang': lang}``. If a hash is
        missing for several queries, the last query wins.
    """
    # JSON is UTF-8 by specification; decode explicitly instead of relying on
    # the platform's default encoding, which is not UTF-8 everywhere.
    with open(query_file, 'rt', encoding='utf-8') as file:
        queries = json.load(file)

    missing_docs = {}
    for query in queries:
        for item in query['items']:
            if util.get_es_document(item['hash'], folder) is None:
                # The first query string is only read for missing documents.
                missing_docs[item['hash']] = {'query': query["queries"][0], 'lang': lang}
    return missing_docs

In [None]:
# Same check against the per-language Elasticsearch folders:
# one dict of missing documents per language, in LANGUAGES order.
es_missing = [are_queries_in_es(lang, QUERIES[lang], ES_DATA_FOLDERS[lang]) for lang in LANGUAGES]
es_missing

In [47]:
def get_identifier_raw(id, identifier):
    """Return one field of a raw document, or None when the document is missing.

    Args:
        id: Hash of the raw document to look up in RAW_DATA_FOLDER. The name
            shadows the builtin ``id`` but is kept so keyword callers keep
            working.
        identifier: Key to read from the raw document (the notebook uses
            'language' and 'mime_type').

    Returns:
        ``document[identifier]`` when util.get_raw_document finds the
        document, otherwise None.
    """
    # Fetch the document once; the lookup used to be performed twice per hit.
    document = util.get_raw_document(id, RAW_DATA_FOLDER)
    if document is None:
        return None
    return document[identifier]

In [48]:
def get_lang_misclassified_docs(missing_dict):
    """Find missing documents whose raw language differs from the query language.

    Args:
        missing_dict: Mapping of document hash to a dict holding at least a
            'lang' entry, as produced by the are_queries_in_* helpers.

    Returns:
        dict mapping each hash whose raw 'language' (looked up through
        get_identifier_raw, None when no raw document is found) is not equal
        to the entry's 'lang' to
        ``{'raw_lang': <raw language>, 'query_lang': <entry lang>}``.
    """
    mismatches = {}
    for doc_hash, entry in missing_dict.items():
        language_in_raw = get_identifier_raw(doc_hash, 'language')
        language_in_query = entry['lang']
        if language_in_query != language_in_raw:
            mismatches[doc_hash] = {
                'raw_lang': language_in_raw,
                'query_lang': language_in_query,
            }
    return mismatches

In [49]:
# For the documents missing from Elasticsearch, list those whose raw language
# differs from the language of the query file they came from (one dict per language).
[get_lang_misclassified_docs(missing_lang) for missing_lang in es_missing]

[{},
 {'0d33bf36292b5167be043d5d6fc87ce04a6b8daa': {'raw_lang': None,
   'query_lang': 'nl'},
  '2f4c55ef5047103d2b77e9b01e2a2d49c596ebc0': {'raw_lang': 'en',
   'query_lang': 'nl'},
  '4e64085e43f526be3a804537383ba63381a2f545': {'raw_lang': 'en',
   'query_lang': 'nl'},
  '7701df9213c5e144e4017c269c2f745cd9c7589d': {'raw_lang': 'en',
   'query_lang': 'nl'},
  '600a70a38d380768a6fc8728389688d94e1efbc3': {'raw_lang': 'en',
   'query_lang': 'nl'},
  'd9206d6ac4ab1a09be72523a846cd4f3db543afd': {'raw_lang': 'en',
   'query_lang': 'nl'}}]

In [62]:
# Spot-check a single raw document by its hash.
util.get_raw_document('dbee97270c21023e12646cf528f01352f383fbcb', RAW_DATA_FOLDER)

{'title': 'Evalueren om te leren',
 'language': 'nl',
 'url': 'https://www.leraar24.nl/evalueren-om-te-leren-2/',
 'text': "Aan.\nEn leraren besteden veel tijd aan het beoordelen van het werk van leerlingen beoordelen in het onderwijs heeft verschillende functies namelijk voorstellen wat leerlingen hebben opgestoken van het onderwijs het beoordelen van het leren en leerlingen bewust maken van hun eigen leerproces beoordelen om te leren.\nBeoordelen om te leren is niet alleen een zaak die van leraren een actieve rol vraagt maar ook leerlingen.\nBasisschool koningshof werkt volgens de aanpak evalueren om te leren om kinderen actief bij hun leerproces te betrekken.\nEvalueren om te leren houdt in dat je als leraar samen met leerlingen tijdens een les informatie over leren verzameld en dat samen interpreteert en de bedoeling daarvan is dat dat kinderen zich bewust worden van van moest leren en wat hen helpt bij het leren en hoe ze dat leren kunnen verbeteren.\nVol vandaag nathan wel wat je

In [61]:
# Look up the documents of the Dutch query file in the English Elasticsearch folder;
# 'nl_in_en' is only the label stored with each missing entry.
nl_queries_in_en = [are_queries_in_es('nl_in_en', QUERIES['nl'], ES_DATA_FOLDERS['en']) ]
nl_queries_in_en

[{'dbee97270c21023e12646cf528f01352f383fbcb': {'query': 'leren leren',
   'lang': 'nl_in_en'},
  '76f47e8cb4767ac5644c57441636a86f16addf64': {'query': 'leren leren',
   'lang': 'nl_in_en'},
  'aa508762def8fb9f34ed7e0186eca1c4207e7298': {'query': 'leren leren',
   'lang': 'nl_in_en'},
  '9138f0cf63eb62848bda8a2c7475011f0076bd96': {'query': 'leren leren',
   'lang': 'nl_in_en'},
  '5d7f20bc33675e7b4cf5072e856e817bf791c409': {'query': 'leren leren',
   'lang': 'nl_in_en'},
  'a80adcbe6de881b98115862847a13a5b99b62077': {'query': 'didactiek',
   'lang': 'nl_in_en'},
  '2d556134bdf44b30de1192331ff99c5be20f9989': {'query': 'leren leren',
   'lang': 'nl_in_en'},
  '096e5cd027af3a71586ab0d80ad0f60ab7d83724': {'query': 'didactiek',
   'lang': 'nl_in_en'},
  'f38af0f8c004ff6ddf557c3f2ecb0e5508655236': {'query': 'leren leren',
   'lang': 'nl_in_en'},
  '0d33bf36292b5167be043d5d6fc87ce04a6b8daa': {'query': 'leren leren',
   'lang': 'nl_in_en'},
  '66eb37bb5a84a644c9cc09f1c7cf94ad21b4cb72': {'query'

In [59]:
# 'mime_type' of the raw document behind every hash missing from Elasticsearch
# (None when no raw document is found for the hash).
[get_identifier_raw(key, 'mime_type') for missing in es_missing for key in missing.keys()]

['text/html',
 'text/html',
 'text/html',
 'text/html',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'text/html',
 None,
 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 'video/x-flv',
 'application/pdf',
 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 'application/pdf',
 'application/pdf',
 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',

In [None]:
# Example search request body.
# NOTE(review): nothing in the visible cells sends this to Elasticsearch.
usersSearch = 'depression'
# 'title^5' boosts matches on the title field by a factor of 5.
# NOTE(review): 'text.nl' is presumably the Dutch-analysed text sub-field — confirm against the index mapping.
Fields = ['title^5', 'text.nl']

query = {
    'query': {
        'multi_match': { 
            'query': usersSearch,  # the search terms
            'fields': Fields,  # the fields searched, with their boosts
        },
    },
  # Disabled option that would set the number of returned hits:
  #  'size': '100'
}