In [129]:
import os
from pprint import pprint

from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import Search
# term and token are synonyms

es = Elasticsearch("http://localhost:9200")

In [10]:
es.info()['version']

{'number': '8.4.3',
 'build_flavor': 'default',
 'build_type': 'docker',
 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73',
 'build_date': '2022-10-04T07:17:24.662462378Z',
 'build_snapshot': False,
 'lucene_version': '9.3.0',
 'minimum_wire_compatibility_version': '7.17.0',
 'minimum_index_compatibility_version': '7.0.0'}

In [34]:
# Equivalent REST call:
# GET /_cat/plugins?v=true&s=component&h=name,component,version,description
#
# The options are passed as keyword arguments (rather than through a raw
# `params` dict), and the `s=component` sort key from the REST call above is
# now actually sent.
print(es.cat.plugins(
    v=True,
    s="component",
    h="name,component,version,description",
))

name         component           version description
8f951096d4f9 analysis-morfologik 8.4.3   Morfologik Polish Lemmatizer plugin for Elasticsearch



In [None]:
# Example request from the Morfologik plugin repository:
# GET _analyze
# {
#   "tokenizer": "standard",
#   "filter": ["morfologik_stem"],
#   "text": "jestem"
# }

In [209]:
# Index settings: one custom analyzer plus the synonym filter it uses.
settings = {
    "analysis":{
        "analyzer": {
            # Analyzer chain: standard tokenizer -> lowercase -> synonyms -> Morfologik stemming.
            "polish_bills_analyzer": { 
               "type":"custom",
               "tokenizer":"standard",
               "filter": [
                  "lowercase",
                   "synonyms_bills",
                   "morfologik_stem",
               ]
            }
        },
        "filter":{
            # Treats each abbreviation of a legal code as equivalent to its full name.
            "synonyms_bills": {
                "type": "synonym_graph",
                "synonyms": [
                    "kpk,kodeks postępowania karnego",
                    "kpc,kodeks postępowania cywilnego",
                    "kk,kodeks karny",
                    "kc,kodeks cywilny"
                ]
            }
        }
    }
}

# Whatever goes into the index has text content in the `content` field,
# which is analyzed by polish_bills_analyzer.
mappings = {
    "properties": {
        "content": {
            "type": "text",
            "analyzer": "polish_bills_analyzer"
        }
    }
}

In [234]:
# Create the `polish_bills` index from the `settings` and `mappings` dicts.
es.indices.create(index="polish_bills", settings=settings, mappings=mappings)
# NOTE(review): the commented-out cleanup call below targets 'bills_analyzer',
# not 'polish_bills' — confirm which index it is meant to drop.
# es.indices.delete(index='bills_analyzer')

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'polish_bills'}

In [235]:
import glob

def load_bills(
    path,
    extension="txt"
):
    """Lazily read every ``*.<extension>`` file found directly inside ``path``.

    Args:
        path: Directory that holds the bill files.
        extension: File extension (without the dot) used to select files.

    Yields:
        ``(file_name, text)`` tuples, where ``file_name`` is the name of the
        file without its directory and ``text`` is the whole file content.
        Files are yielded in sorted path order, so repeated runs produce the
        same sequence.
    """
    pattern = os.path.join(path, f"*.{extension}")
    # glob returns files in arbitrary order; sort for reproducible runs.
    for file_path in sorted(glob.glob(pattern)):
        # Explicit encoding: the bills contain Polish diacritics, and the
        # platform default encoding is not UTF-8 everywhere.
        with open(file_path, encoding="utf-8") as f:
            # basename (instead of splitting on "/") also handles Windows paths.
            yield (
                os.path.basename(file_path),
                f.read(),
            )

In [236]:
# Read every bill up front so the number of documents is known before upload.
bills = list(load_bills(path="/Users/mateusz/nlp/bills"))
print(f"{len(bills)} polish bills to upload to the ES index.")

# Bulk payload: each document contributes an action line followed by its source.
data = []
for filename, content in bills:
    data.extend((
        {"create": {"_index": "polish_bills", "_id": filename}},
        {"content": content},
    ))

1178 polish bills to upload to the ES index.


In [239]:
# Pass the payload by keyword (the 8.x client rejects positional arguments)
# and wait for a refresh so that the count check that follows sees the new
# documents.
bulk_response = es.bulk(body=data, index="polish_bills", refresh="wait_for")

# The bulk API reports per-item failures inside the response body instead of
# raising, so they have to be inspected explicitly.
failed_items = [
    item["create"] for item in bulk_response["items"] if "error" in item["create"]
]
print(
    f"Bulk upload finished: {len(bulk_response['items']) - len(failed_items)} created, "
    f"{len(failed_items)} failed."
)

In [243]:
# Compare the number of indexed documents with the number of bills read from
# disk, and report a mismatch instead of staying silent.
indexed_count = es.count(index="polish_bills")['count']
if indexed_count == len(bills):
    print("Data successfully uploaded.")
else:
    print(f"Upload incomplete: index holds {indexed_count} documents, expected {len(bills)}.")

Data successfully uploaded.


In [504]:
def es_search(query: dict, index: str = "polish_bills"):
    """Search ``index`` with ``query`` passed as the ``query`` argument.

    Args:
        query: The query clause itself, e.g. ``{"match": {"content": "..."}}``.
        index: Name of the index to search.

    Returns:
        Whatever ``es.search`` returns.
    """
    return es.search(index=index, query=query)


def es_search_body(query: dict, index: str = "polish_bills"):
    """Search ``index`` with ``query`` passed as the whole request ``body``.

    Args:
        query: Full request body, e.g. ``{"query": {...}, "size": 10}``.
        index: Name of the index to search.

    Returns:
        Whatever ``es.search`` returns.
    """
    return es.search(index=index, body=query)

In [340]:
def get_ids(index="polish_bills", doc_type="text"):
    """Return the ids of the hits produced by scanning ``index``.

    Builds a ``Search`` bound to the module-level ``es`` client and collects
    ``hit.meta.id`` for every hit yielded by ``Search.scan()``.
    """
    search = Search(using=es, index=index, doc_type=doc_type)
    return [hit.meta.id for hit in search.scan()]

# Collect every document id once; show a small sample only.
ids = get_ids()
print(ids[:5])

['2001_874.txt', '1996_583.txt', '2003_1853.txt', '1997_753.txt', '2000_440.txt']


#### 6. Determine the number of legislative acts containing the word ustawa (in any form).

In [513]:
# Full request body: match documents whose `content` field matches "ustawa".
query = {
    "query": {
        "match": {
            "content": "ustawa"
        }
    },
    "size": 1178, # explicit size: without it only 10 hits came back, and later cells need every document
    "_source": False,
    "explain": True
}

# `result` is reused by the following cells, which read each hit's `_explanation`.
result = es_search_body(query)
print(result['hits']['total']['value'])

  return es.search(index=index, body=query)


1178


#### 7.  Determine the number of occurrences of the word ustawa by searching for this particular form, including the other inflectional forms.

In [514]:
# Analysis results, one entry per document
len(result['hits']['hits'])

1178

In [515]:
result['hits']['hits'][0]['_explanation']['details'][0]['details'][-1]['details']
# The first nested dictionary holds the total number of occurrences of the term within the document

[{'value': 156.0,
  'description': 'freq, occurrences of term within document',
  'details': []},
 {'value': 1.2, 'description': 'k1, term saturation parameter', 'details': []},
 {'value': 0.75,
  'description': 'b, length normalization parameter',
  'details': []},
 {'value': 6680.0,
  'description': 'dl, length of field (approximate)',
  'details': []},
 {'value': 4391.8,
  'description': 'avgdl, average length of field',
  'details': []}]

In [516]:
def get_occurences(document):
    """Pull the term-frequency value out of one hit's ``_explanation`` tree.

    Follows the fixed path: first detail -> its last sub-detail -> that
    node's first entry, and returns the entry's ``value``.
    """
    first_detail = document['_explanation']['details'][0]
    last_subdetail = first_detail['details'][-1]
    freq_entry = last_subdetail['details'][0]
    return freq_entry['value']

documents = list(result['hits']['hits'])

# Add up the per-document term frequencies and report the total as an integer.
occurences = int(sum(get_occurences(hit) for hit in documents))

print(occurences)

24934


#### Insights into TermVector API

In [352]:
# One term-vector request per document id, each including term statistics
# for the `content` field.
occurences_termv = []
for _id in ids:
    occurences_termv.append(
        es.termvectors(
            index="polish_bills",
            id=_id,
            fields=['content'],
            term_statistics=True,
        )
    )

In [353]:
occurences_termv

{'_index': 'polish_bills',
 '_id': '2001_874.txt',
 '_version': 1,
 'found': True,
 'took': 60,
 'term_vectors': {'content': {'field_statistics': {'sum_doc_freq': 705461,
    'doc_count': 1178,
    'sum_ttf': 5173540},
   'terms': {'1': {'doc_freq': 1178,
     'ttf': 73970,
     'term_freq': 1,
     'tokens': [{'position': 23, 'start_offset': 445, 'end_offset': 446}]},
    '106': {'doc_freq': 548,
     'ttf': 2042,
     'term_freq': 3,
     'tokens': [{'position': 76, 'start_offset': 664, 'end_offset': 667},
      {'position': 97, 'start_offset': 746, 'end_offset': 749},
      {'position': 104, 'start_offset': 774, 'end_offset': 777}]},
    '110': {'doc_freq': 316,
     'ttf': 709,
     'term_freq': 1,
     'tokens': [{'position': 117, 'start_offset': 824, 'end_offset': 827}]},
    '1255': {'doc_freq': 96,
     'ttf': 152,
     'term_freq': 1,
     'tokens': [{'position': 119, 'start_offset': 834, 'end_offset': 838}]},
    '152': {'doc_freq': 127,
     'ttf': 209,
     'term_freq': 1,


In [452]:
# The structure above makes it possible to define a path to the statistics of a specific word

def get_occurences_tm(document_id, word, index="polish_bills"):
    """
    Read the total term frequency (``ttf``) of ``word`` via the term vectors API.

    The request is made for ``document_id`` and the response is narrowed with
    ``filter_path`` to ``term_vectors.content.terms.<word>.ttf``, so a
    successful response looks like:
    {'term_vectors': {'content': {'terms': {'ustawa': {'ttf': 24934}}}}}

    Statistics are only retrieved for the shard the requested document lives
    in, so one document id per shard is needed for complete statistics.

    Returns the ``ttf`` value, or the message string
    "Not fetched. Emtpy response from TM API" when the response is empty.
    """
    ttf_path = f'term_vectors.content.terms.{word}.ttf'

    resp = es.termvectors(
        index=index,
        id=document_id,
        fields=['content'],  # the field declared in the mapping
        filter_path=[ttf_path],
        term_statistics=True,
    )

    # Guard clause: an empty response means nothing matched the filter path.
    if not resp:
        return "Not fetched. Emtpy response from TM API"
    return resp['term_vectors']['content']['terms'][f"{word}"]['ttf']

In [453]:
get_occurences_tm('2001_874.txt', word="ustawa") # Works because all documents are in a single shard

24934

#### 8. Determine the number of occurrences of the word ustaw by searching for this particular form, including the other inflectional forms.

In [454]:
get_occurences_tm('2001_874.txt', word="ustaw")

'Not fetched. Emtpy response from TM API'

In [455]:
# The request body names the analyzer to apply and the text to run it on.
analyze_body = {'analyzer': "polish_bills_analyzer", 'text': 'ustaw'}
r = es.indices.analyze(index="polish_bills", body=analyze_body)

r['tokens']

[{'token': 'ustawa',
  'start_offset': 0,
  'end_offset': 5,
  'type': '<ALPHANUM>',
  'position': 0},
 {'token': 'ustawić',
  'start_offset': 0,
  'end_offset': 5,
  'type': '<ALPHANUM>',
  'position': 0}]

In [456]:
def find_alhpanum_word_form(
    word:str,
    analyzer="polish_bills_analyzer",
    index="polish_bills"
):
    """Run ``word`` through ``analyzer`` on ``index`` and return the tokens.

    Returns the ``token`` string of every entry in the ``tokens`` list of the
    analyze response, in response order.
    """
    request_body = {'analyzer': analyzer, 'text': word}
    analysis = es.indices.analyze(index=index, body=request_body)

    forms = []
    for entry in analysis['tokens']:
        forms.append(entry['token'])
    return forms

In [457]:
ustaw_forms = find_alhpanum_word_form(word='ustaw')
ustaw_forms

['ustawa', 'ustawić']

In [458]:
# Look up the index-wide count of every analyzed form, using document
# '2001_874.txt' as the base document for the term-vector request.
ustaw_results = {
    form: get_occurences_tm('2001_874.txt', word=form)
    for form in ustaw_forms
}

In [461]:
get_occurences_tm('2001_874.txt', word="ustawić") 

'Not fetched. Emtpy response from TM API'

In [473]:
# Fetch the unfiltered term vector (no filter_path) of the base document,
# including term statistics.
resp = es.termvectors(
    term_statistics=True,
    fields=['content'],  # the field declared in the mapping
    id='2001_874.txt',
    index="polish_bills",
)

In [474]:
resp['term_vectors']['content']['terms']['ustawić']

KeyError: 'ustawić'

In [478]:
# Same unfiltered request, but for a different base document.
# Conclusion: the base document has to contain the word for its statistics
# across the entire index to be fetched.
resp = es.termvectors(
    term_statistics=True,
    fields=['content'],  # the field declared in the mapping
    id='1993_599.txt',
    index="polish_bills",
)
resp['term_vectors']['content']['terms']['ustawić']

{'doc_freq': 378,
 'ttf': 913,
 'term_freq': 1,
 'tokens': [{'position': 3112, 'start_offset': 24864, 'end_offset': 24869}]}

In [479]:
# Re-query every form with '1993_599.txt' as the base document.
ustaw_results.update({
    form: get_occurences_tm('1993_599.txt', word=form)
    for form in ustaw_forms
})

In [480]:
ustaw_results

{'ustawa': 24934, 'ustawić': 913}

In [483]:
sum(ustaw_results.values())

25847

#### Classical approach.

In [526]:
# Match on the raw form "ustaw" and request a scoring explanation for each hit.
query = {
    "query": {"match": {"content": "ustaw"}},
    "size": 1178,
    "_source": False,
    "explain": True,
}

result = es.search(body=query, index='polish_bills')
documents = list(result['hits']['hits'])

# Total of the per-document frequencies read from the explanations.
occurences = int(sum(get_occurences(hit) for hit in documents))
print(occurences)

  result = es.search(body=query, index='polish_bills')


25847


#### 9. Determine the number of legislative acts containing the words kodeks postępowania cywilnego in the specified order, but in any inflection form.

In [536]:
# The task asks for the words in the specified order, so this is a phrase
# query: `match_phrase` requires the analyzed terms to appear as a phrase,
# whereas a plain `match` does not express that requirement.
query = {
    "query": {
        "match_phrase": {
            "content": "kodeks postępowania cywilnego"
        }
    }
}

# The same request body is used for both APIs so the two counts can be compared.
result = es.count(body=query, index='polish_bills')
print(f"Using es.count: {result['count']}")

result = es.search(body=query, index='polish_bills')
print(f"Using es.search: {result['hits']['total']['value']}")

Using es.count: 99
Using es.search: 99


  result = es.search(body=query, index='polish_bills')


#### 10. Determine the number of legislative acts containing the words wchodzi w życie (in any form) allowing for up to 2 additional words in the searched phrase.

In [543]:
no_additional_words = 2

# Phrase query on `content`; `slop` is set to the number of additional words.
phrase_clause = {"query": "wchodzi w życie", "slop": no_additional_words}
query = {"query": {"match_phrase": {"content": phrase_clause}}}

result = es.count(body=query, index='polish_bills')
print(f"Using es.count: {result['count']}")

Using es.count: 1174


#### 11. Determine the 10 documents that are the most relevant for the phrase konstytucja.

In [546]:
query = {"match": {"content": "konstytucja"}}

# Keep only the id and score of each hit; `size=10` limits the result to ten hits.
resp = es.search(
    index="polish_bills",
    query=query,
    size=10,
    filter_path=["hits.hits._id", "hits.hits._score"],
)

resp['hits']['hits']

[{'_id': '1997_629.txt', '_score': 6.867639},
 {'_id': '2000_443.txt', '_score': 6.6627645},
 {'_id': '1997_604.txt', '_score': 6.632078},
 {'_id': '1996_350.txt', '_score': 6.626821},
 {'_id': '1997_642.txt', '_score': 6.2516394},
 {'_id': '2001_23.txt', '_score': 6.057972},
 {'_id': '1996_199.txt', '_score': 5.9280615},
 {'_id': '1999_688.txt', '_score': 5.849732},
 {'_id': '2001_1082.txt', '_score': 5.466578},
 {'_id': '1997_681.txt', '_score': 5.466578}]

#### 12. Print the excerpts containing the word konstytucja (up to three excerpts per document) from the previous task.

In [550]:
# Same query and size as in the previous task.
query = {
    "match": {
        "content": "konstytucja"
    }
}

# Ask the highlighter for at most three fragments of `content` per document.
highlight = {
    "fields": {
        "content": {
           "number_of_fragments" : 3
        }
    }
}

# Keep only the id and the highlighted fragments of each hit.
resp = es.search(
    index="polish_bills",
    query=query,
    highlight=highlight,
    filter_path=["hits.hits._id", "hits.hits.highlight"],
    size=10
)

# `pprint` is imported in the notebook's import cell.
pprint(resp['hits']['hits'])

[{'_id': '1997_629.txt',
  'highlight': {'content': ['o zmianie ustawy konstytucyjnej o trybie '
                            'przygotowania\n'
                            '           i uchwalenia <em>Konstytucji</em> '
                            'Rzeczypospolitej',
                            'W ustawie  konstytucyjnej z  dnia 23 kwietnia '
                            '1992 r. o trybie przygotowania i \n'
                            'uchwalenia <em>Konstytucji</em>',
                            'Do zgłoszenia projektu <em>Konstytucji</em> '
                            'załącza się wykaz \n'
                            '                obywateli popierających '
                            'zgłoszenie']}},
 {'_id': '2000_443.txt',
  'highlight': {'content': ['umowy międzynarodowej i nie wypełnia przesłanek '
                            'określonych w art. 89\n'
                            '     ust. 1 lub art. 90 <em>Konstytucji</em>',
                            'międzynarodowej lub za