In [22]:
import json

import requests

import utils

# Base URL of the Elasticsearch node that every request in this notebook targets.
ES_ADDRESS = 'http://localhost:9200'

# Corpus loaded through the project helper; later cells iterate it with
# .items() as (file, text) pairs.
text_by_file = utils.load_files('./ustawy/*')

1. Install ElasticSearch (ES).
2. Install an ES plugin for Polish.

[Dockerfile:](./Dockerfile)
```docker
# Base image: Elasticsearch 8.4.3 from the docker.elastic.co registry.
FROM docker.elastic.co/elasticsearch/elasticsearch:8.4.3

# Install the Morfologik analysis plugin; its version (8.4.3) matches the base image version.
RUN bin/elasticsearch-plugin install pl.allegro.tech.elasticsearch.plugin:elasticsearch-analysis-morfologik:8.4.3

# Start the container through the entrypoint script, passing "elasticsearch" as its argument.
ENTRYPOINT [ "/usr/local/bin/docker-entrypoint.sh","elasticsearch" ]
```

[bash script:](./es.sh)
```bash
#!/usr/bin/env bash
# Build the Elasticsearch + Morfologik image from the Dockerfile in the current
# directory, then run it as a single-node cluster.

# Stop at the first failing command so that a failed build never falls through
# to `docker run` with a stale or missing image.
set -euo pipefail

ES_CONTAINER="es-morfologik:latest"

# Quote the expansion so the image name is always passed as a single word.
docker build -t "$ES_CONTAINER" .

# Ports 9200 and 9300 are published to the host; the two -e options switch the
# node to single-node discovery and turn xpack security off.
docker run \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    "$ES_CONTAINER"
```

3. Define an ES analyzer for Polish texts containing:
   1. standard tokenizer
   1. synonym filter with the following definitions:
      1. kpk - kodeks postępowania karnego
      1. kpc - kodeks postępowania cywilnego
      1. kk - kodeks karny
      1. kc - kodeks cywilny
   1. Morfologik-based lemmatizer
   1. lowercase filter

In [23]:
# Explicit "abbreviation => expansion" rules for the four Polish legal codes.
legal_code_synonyms = [
    "kpk => kodeks postępowania karnego",
    "kpc => kodeks postępowania cywilnego",
    "kk => kodeks karny",
    "kc => kodeks cywilny",
]

# Token-filter definitions keyed by filter name; "synonym" is the name the
# analyzer's filter chain refers to.
synonym_filter = {
    "synonym": dict(type="synonym", synonyms=legal_code_synonyms),
}

# Custom analyzer definition keyed by its name. Tokens from the standard
# tokenizer pass through the filters in the listed order: lowercase, then the
# "synonym" filter, then morfologik_stem.
analyzer = {
    "polish_text_analyzer": dict(
        type="custom",
        tokenizer="standard",
        filter=["lowercase", "synonym", "morfologik_stem"],
    ),
}

4. Define an ES index for storing the contents of the legislative acts.

In [24]:
# Analysis section: the custom analyzer plus the token-filter definitions.
analysis_settings = {"analyzer": analyzer, "filter": synonym_filter}

# Field mappings: "contents" is analysed with the custom Polish analyzer,
# "file" is a text field with no analyzer override.
field_mappings = {
    "file": dict(type="text"),
    "contents": dict(type="text", analyzer="polish_text_analyzer"),
}

index = {
    "settings": {"index": {"analysis": analysis_settings}},
    "mappings": {"properties": field_mappings},
}
index_url = ES_ADDRESS + '/ustawy'

# Create the index by PUT-ting its definition; the raw response body is the cell output.
# NOTE(review): re-running this cell against an already existing index presumably returns an error - confirm.
r = requests.put(index_url, json=index)
r.text

'{"acknowledged":true,"shards_acknowledged":true,"index":"ustawy"}'

5. Load the data to the ES index.

In [25]:
# Build the NDJSON body for the _bulk endpoint: one action line followed by one
# document line per file.
bulk_lines = []

for file, text in text_by_file.items():
    # Collapse every whitespace run (newlines included) into a single space.
    contents = ' '.join(text.split())
    bulk_lines.append(json.dumps({"index": {}}))
    # json.dumps escapes quotes, backslashes and control characters in both
    # fields, so every document is valid JSON on a single line.
    bulk_lines.append(json.dumps({"file": file, "contents": contents}, ensure_ascii=False))

# Each line, the last one included, is terminated by a newline.
bulk = ''.join(line + '\n' for line in bulk_lines)

headers = {'Content-Type': 'application/x-ndjson'}
bulk_url = index_url + '/_bulk'

r = requests.post(bulk_url, data=bulk.encode('utf-8'), headers=headers)
# The "errors" flag of the bulk response is the cell output.
r.json()['errors']

False

6. Determine the number of legislative acts containing the word **ustawa** (in any form).

In [26]:
search_url = f'{index_url}/_count'

# Count the documents whose "contents" field matches "ustawa".
query = {"query": {"match": {"contents": "ustawa"}}}

r = requests.get(search_url, json=query)
count_response = r.json()
print(count_response['count'])

1178


7. Determine the number of occurrences of the word **ustawa** by searching for this particular form, including the other inflectional forms.

In [28]:
search_url = f'{index_url}/_search'

# Step 1: find a document matching "ustawa"; the id of the first hit is used below.
query = {"query": {"match": {"contents": "ustawa"}}}

r = requests.get(search_url, json=query)
first_hit = r.json()['hits']['hits'][0]
doc_id = first_hit['_id']

# Step 2: request that document's term vectors for "contents", with term
# statistics switched on and every other optional section switched off.
termvec_url = f'{index_url}/_termvectors/{doc_id}'

query = dict(
    fields=["contents"],
    offsets=False,
    payloads=False,
    positions=False,
    term_statistics=True,
    field_statistics=False,
)

r = requests.get(termvec_url, json=query)
# The cell output is the `ttf` statistic reported for the term "ustawa".
r.json()['term_vectors']['contents']['terms']['ustawa']['ttf']

24934

8. Determine the number of occurrences of the word **ustaw** by searching for this particular form, including the other inflectional forms.

In [29]:
search_url = f'{index_url}/_search'

# Step 1: find a document matching "ustaw"; the id of the first hit is used below.
query = {"query": {"match": {"contents": "ustaw"}}}

r = requests.get(search_url, json=query)
first_hit = r.json()['hits']['hits'][0]
doc_id = first_hit['_id']

# Step 2: request that document's term vectors for "contents", with term
# statistics switched on and every other optional section switched off.
termvec_url = f'{index_url}/_termvectors/{doc_id}'

query = dict(
    fields=["contents"],
    offsets=False,
    payloads=False,
    positions=False,
    term_statistics=True,
    field_statistics=False,
)

r = requests.get(termvec_url, json=query)
# The cell output is the `ttf` statistic reported for the term "ustawić".
r.json()['term_vectors']['contents']['terms']['ustawić']['ttf']

913

9. Determine the number of legislative acts containing the words **kodeks postępowania cywilnego** in the specified order, but in any inflection form.

In [36]:
search_url = f'{index_url}/_count'

# Phrase clause: the words must appear in this order; the query text is
# analysed with the custom Polish analyzer.
phrase_clause = dict(
    query="kodeks postępowania cywilnego",
    analyzer="polish_text_analyzer",
)
query = {"query": {"match_phrase": {"contents": phrase_clause}}}

r = requests.get(search_url, json=query)
count_response = r.json()
print(count_response['count'])

99

10. Determine the number of legislative acts containing the words **wchodzi w życie** (in any form) allowing for up to 2 additional words in the searched phrase.

In [66]:
search_url = f'{index_url}/_count'

# Phrase clause with slop 2; the query text is analysed with the custom Polish
# analyzer.
phrase_clause = dict(
    query="wchodzi w życie",
    analyzer="polish_text_analyzer",
    slop=2,
)
query = {"query": {"match_phrase": {"contents": phrase_clause}}}

r = requests.get(search_url, json=query)
count_response = r.json()
print(count_response['count'])

1174


11. Determine the 10 documents that are the most relevant for the phrase **konstytucja**.

In [65]:
search_url = index_url + '/_search'

# Hits are returned in descending relevance order, so asking for the first 10
# is enough; only the "file" field is needed from each document's source.
query = {
    "query": {
        "match": {
            "contents": "konstytucja",
        }
    },
    "size": 10,
    "_source": ["file"],
}

r = requests.get(search_url, json=query)
result = r.json()['hits']

# Number of documents matching the query overall, not just the 10 returned.
print(result['total']['value'])

# Summaries of the 10 most relevant documents; the '_id' values are reused by
# the highlighting cell.
best_10 = [
    {'_id': hit['_id'], 'file': hit['_source']['file'], 'score': hit['_score']}
    for hit in result['hits']
]
for doc in best_10:
    print(doc)


45
{'_id': 'llPpQYQBX6EqmV7Eh07l', 'file': './ustawy/1997_629.txt', 'score': 6.867635}
{'_id': 'u1PpQYQBX6EqmV7Eh0va', 'file': './ustawy/2000_443.txt', 'score': 6.662749}
{'_id': 'clPpQYQBX6EqmV7Eh0vY', 'file': './ustawy/1997_604.txt', 'score': 6.6320543}
{'_id': 'cFPpQYQBX6EqmV7Eh03i', 'file': './ustawy/1996_350.txt', 'score': 6.626803}
{'_id': 'nVPpQYQBX6EqmV7Eh0vZ', 'file': './ustawy/1997_642.txt', 'score': 6.251624}
{'_id': 'dlPpQYQBX6EqmV7Eh07l', 'file': './ustawy/2001_23.txt', 'score': 6.0579295}
{'_id': 'MFPpQYQBX6EqmV7Eh03h', 'file': './ustawy/1996_199.txt', 'score': 5.928016}
{'_id': 'H1PpQYQBX6EqmV7Eh03h', 'file': './ustawy/1999_688.txt', 'score': 5.8496947}
{'_id': 'VlPpQYQBX6EqmV7Eh03i', 'file': './ustawy/1997_681.txt', 'score': 5.466536}
{'_id': '4VPpQYQBX6EqmV7Eh03j', 'file': './ustawy/2001_1082.txt', 'score': 5.466536}


12. Print the excerpts containing the word **konstytucja** (up to three excerpts per document) from the previous task.

In [63]:
search_url = f'{index_url}/_search'
ids = [doc['_id'] for doc in best_10]

# Both clauses must hold: the document matches "konstytucja" and its id is one
# of the ids collected from best_10.
match_clause = {"match": {"contents": "konstytucja"}}
ids_clause = {"ids": {"values": ids}}

query = {
    "query": {"bool": {"must": [match_clause, ids_clause]}},
    "highlight": {"fragment_size": 10, "fields": {"contents": {}}},
}

r = requests.get(search_url, json=query)
for hit in r.json()['hits']['hits']:
    highlights = hit['highlight']['contents']
    # Print at most three highlighted fragments per document.
    for highlight in highlights[:3]:
        file_name = hit["_source"]["file"]
        print(f'file: {file_name} highlight: {highlight}')

file: ./ustawy/1997_629.txt highlight: uchwalenia <em>Konstytucji</em>
file: ./ustawy/1997_629.txt highlight: uchwalenia <em>Konstytucji</em>
file: ./ustawy/1997_629.txt highlight: projektowi <em>Konstytucji</em>
file: ./ustawy/2000_443.txt highlight: i art. 90 <em>Konstytucji</em>
file: ./ustawy/2000_443.txt highlight: i art. 90 <em>Konstytucji</em>
file: ./ustawy/2000_443.txt highlight: 89 ust. 2 <em>Konstytucji</em>
file: ./ustawy/1997_604.txt highlight: zgodności z <em>Konstytucją</em>
file: ./ustawy/1997_604.txt highlight: politycznej z <em>Konstytucją</em>
file: ./ustawy/1997_604.txt highlight: politycznej z <em>Konstytucją</em>
file: ./ustawy/1996_350.txt highlight: naruszenie <em>Konstytucji</em>
file: ./ustawy/1996_350.txt highlight: art. 107 <em>Konstytucji</em>
file: ./ustawy/1996_350.txt highlight: naruszenie <em>Konstytucji</em>
file: ./ustawy/1997_642.txt highlight: międzynarodowych z <em>Konstytucją</em>
file: ./ustawy/1997_642.txt highlight: państwowe, z <em>Konstytucją