In [70]:
%pip install elasticsearch==7.17.6

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [71]:
# Import the project helper that loads the bill texts used throughout the notebook.
try:
    import importlib
    from utils import load_all_bills
except Exception as e:
    print(e)
    # Re-raise instead of swallowing: without `load_all_bills` the later
    # `bills = load_all_bills()` cell would only fail with an unrelated NameError.
    raise

In [72]:
import requests


In [73]:
from elasticsearch import Elasticsearch

In [74]:
# Client for the Elasticsearch node addressed by the URL below.
es_url = "http://localhost:9200"
client = Elasticsearch(es_url)

In [75]:
# Run the "morfologik" analyzer on the sample text "kpk" and show the tokens.
analyze_request = {"analyzer": "morfologik", "text": "kpk"}
client.indices.analyze(body=analyze_request)

{'tokens': [{'token': 'kpk',
   'start_offset': 0,
   'end_offset': 3,
   'type': '<ALPHANUM>',
   'position': 0}]}

In [76]:
# Custom analyzer definition: the standard tokenizer followed by three token
# filters, listed in the order synonym -> morfologik_stem -> lowercase.
my_analizer = dict(
    tokenizer="standard",
    filter=["synonym", "morfologik_stem", "lowercase"],
)

In [77]:
# Synonym rules: each entry pairs the full name of a legal code with its abbreviation.
legal_code_synonyms = [
    "kodeks postępowania karnego,kpk",
    "kodeks postępowania cywilnego,kpc",
    "kodeks karny,kk",
    "kodeks cywilny,kc",
]

# Definition of the "synonym" token filter referenced by the custom analyzer.
synonym_filter = dict(type="synonym", synonyms=legal_code_synonyms, lenient=True)

In [78]:
# Index settings: register the custom analyzer under the name "my_analyzer"
# and the synonym filter under the name "synonym".
analysis_config = {
    "analyzer": {"my_analyzer": my_analizer},
    "filter": {"synonym": synonym_filter},
}
settings = {"index": {"analysis": analysis_config}}


In [79]:
# Index mapping: a single "text" field analysed with "my_analyzer".
text_field = dict(type="text", analyzer="my_analyzer")
mapping = {"properties": {"text": text_field}}

In [80]:
# Name of the Elasticsearch index passed as `index=` to the client calls below.
my_index = "teksty-prawne"

In [81]:
# Delete the index; HTTP 400/404 responses are ignored instead of raised
# (presumably so a run does not fail when the index does not exist yet).
client.indices.delete(index=my_index, ignore=[400, 404])

{'acknowledged': True}

In [82]:
# Create the index with the analysis settings and the "text" field mapping defined above.
client.indices.create(index=my_index, settings=settings, mappings=mapping)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'teksty-prawne'}

In [83]:
# Load the bills with the project helper. The result is used below as a mapping
# iterated with .items() as (file name, text) pairs.
bills = load_all_bills()

In [84]:
import re

# Collapse every run of whitespace in each bill into a single space.
whitespace_run = re.compile(r'\s+')
new_bills = {name: whitespace_run.sub(' ', text) for name, text in bills.items()}

# From here on `bills` refers to the normalised texts.
bills = new_bills

In [85]:
# Delete every document in the index (match_all selects all of them).
match_all_body = {"query": {"match_all": {}}}
client.delete_by_query(index=my_index, body=match_all_body)

{'took': 1,
 'timed_out': False,
 'total': 0,
 'deleted': 0,
 'batches': 0,
 'version_conflicts': 0,
 'noops': 0,
 'retries': {'bulk': 0, 'search': 0},
 'throttled_millis': 0,
 'requests_per_second': -1.0,
 'throttled_until_millis': 0,
 'failures': []}

In [86]:
# Refresh the index (presumably so the preceding delete is visible to the count below).
client.indices.refresh(index=my_index)

{'_shards': {'total': 2, 'successful': 1, 'failed': 0}}

In [87]:
# Number of documents currently in the index.
client.count(index=my_index)

{'count': 0,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}

In [88]:
# Refresh the index again before indexing the bills.
client.indices.refresh(index=my_index)

{'_shards': {'total': 2, 'successful': 1, 'failed': 0}}

In [89]:
# Index each bill as its own document: the file name is the document id and
# the bill text goes into the "text" field.
for doc_id, doc_text in bills.items():
    client.index(index=my_index, id=doc_id, document={"text": doc_text})


In [90]:
# Refresh the index (presumably so the newly indexed bills are visible to count/search).
client.indices.refresh(index=my_index)

{'_shards': {'total': 2, 'successful': 1, 'failed': 0}}

In [91]:
# Number of documents in the index after indexing the bills.
client.count(index=my_index)

{'count': 1178,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}

In [92]:
# Number of entries in `bills`, for comparison with the index document count above.
print(len(bills))

1178


In [93]:
# Full-text match query for "ustawa" on the "text" field.
query = {"match": {"text": "ustawa"}}


The number of documents with the word "ustawa"

In [94]:
# Run `query` and show the total number of matching documents.
search_response = client.search(index=my_index, query=query, track_total_hits=True)
search_response['hits']['total']['value']

1178

In [95]:
# Analyse the word "ustawa" with the index's custom analyzer and show the tokens.
ustawa_request = {"analyzer": "my_analyzer", "text": "ustawa"}
analyzed_ustawa = client.indices.analyze(index=my_index, body=ustawa_request)
analyzed_ustawa

{'tokens': [{'token': 'ustawa',
   'start_offset': 0,
   'end_offset': 6,
   'type': '<ALPHANUM>',
   'position': 0}]}

In [96]:
# Collect the token strings produced by the analyzer for "ustawa".
forms = [entry["token"] for entry in analyzed_ustawa["tokens"]]

In [97]:

# No explicit headers: requests sets Content-Type itself when the body is passed
# via json= (it would not when passing data=).
headers = {}

# Request body for the term-vectors call: restrict to the "text" field and ask
# for offsets, payloads, positions and term statistics.
json_data = dict(
    fields=['text'],
    offsets=True,
    payloads=True,
    positions=True,
    term_statistics=True,
)

# Fetch the term vectors of document "1994_591.txt". The URL is built from
# `my_index` so it always targets the same index as the client calls.
termvectors_url = 'http://localhost:9200/' + my_index + '/_termvectors/1994_591.txt?pretty'
response = requests.get(termvectors_url, headers=headers, json=json_data)
# Fail loudly on an HTTP error instead of parsing an error payload as term vectors.
response.raise_for_status()

term_vectors = response.json()

In [98]:
# Display the raw term-vector response (the output is very large).
term_vectors

{'_index': 'teksty-prawne',
 '_type': '_doc',
 '_id': '1994_591.txt',
 '_version': 1,
 'found': True,
 'took': 174,
 'term_vectors': {'text': {'field_statistics': {'sum_doc_freq': 708065,
    'doc_count': 1178,
    'sum_ttf': 5157375},
   'terms': {'000': {'doc_freq': 152,
     'ttf': 1042,
     'term_freq': 14,
     'tokens': [{'position': 112, 'start_offset': 767, 'end_offset': 770},
      {'position': 308, 'start_offset': 2257, 'end_offset': 2260},
      {'position': 10307, 'start_offset': 77452, 'end_offset': 77455},
      {'position': 10308, 'start_offset': 77456, 'end_offset': 77459},
      {'position': 10326, 'start_offset': 77578, 'end_offset': 77581},
      {'position': 10327, 'start_offset': 77582, 'end_offset': 77585},
      {'position': 12528, 'start_offset': 94703, 'end_offset': 94706},
      {'position': 12529, 'start_offset': 94707, 'end_offset': 94710},
      {'position': 12550, 'start_offset': 94845, 'end_offset': 94848},
      {'position': 12551, 'start_offset': 94849

The total number of occurrences of the word "ustawa" in any form

In [99]:
# Look up the "ttf" statistic of every analysed form in the term-vector response
# and add them up. NOTE(review): the response covers the terms of one document
# only, so a form missing from that document raises KeyError here.
indexed_terms = term_vectors["term_vectors"]['text']["terms"]
form_frequency = []
for form in forms:
    form_frequency.append(indexed_terms[form]["ttf"])
sum(form_frequency)

24934

In [100]:
# Analyse the word "ustaw" with the index's custom analyzer and show the tokens.
ustaw_request = {"analyzer": "my_analyzer", "text": "ustaw"}
analyzed_ustaw = client.indices.analyze(index=my_index, body=ustaw_request)
analyzed_ustaw

{'tokens': [{'token': 'ustawa',
   'start_offset': 0,
   'end_offset': 5,
   'type': '<ALPHANUM>',
   'position': 0},
  {'token': 'ustawić',
   'start_offset': 0,
   'end_offset': 5,
   'type': '<ALPHANUM>',
   'position': 0}]}

In [102]:
# Collect the token strings produced by the analyzer for "ustaw" and show them.
forms = [entry["token"] for entry in analyzed_ustaw["tokens"]]
forms

['ustawa', 'ustawić']

The total number of occurrences of the word "ustaw" in any form

In [103]:
# Sum the "ttf" statistic over all analysed forms of "ustaw".
# NOTE(review): the lookup uses the term vectors of one document, so a form
# that does not occur in that document raises KeyError.
terms_of_document = term_vectors["term_vectors"]['text']["terms"]
form_frequency = list(map(lambda form: terms_of_document[form]["ttf"], forms))
sum(form_frequency)

25847

In [104]:
# Exact phrase query (slop 0) on the "text" field, analysed with "my_analyzer".
phrase_options = {
    "query": "kodeks postępowania cywilnego",
    "analyzer": "my_analyzer",
    "slop": 0,
}
query = {"match_phrase": {"text": phrase_options}}


The number of occurrences of the phrase "kodeks postępowania cywilnego" (counted as the number of matching documents)

In [105]:
# Run the phrase query and show the total number of matching documents.
search_response = client.search(index=my_index, query=query, track_total_hits=True)
search_response['hits']['total']['value']

99

In [106]:
# Phrase query on the "text" field with a slop of 2, analysed with "my_analyzer".
phrase_options = {
    "query": "wchodzi w życie",
    "analyzer": "my_analyzer",
    "slop": 2,
}
query = {"match_phrase": {"text": phrase_options}}

The number of occurrences of the phrase "wchodzi w życie", matched with a slop of 2 and counted as the number of matching documents (presumably every document ends with information on when it enters into force).

In [107]:
# Run the sloppy phrase query and show the total number of matching documents.
search_response = client.search(index=my_index, query=query, track_total_hits=True)
search_response['hits']['total']['value']

1174

In [108]:
# Full-text match query for "konstytucja" on the "text" field.
query = {"match": {"text": "konstytucja"}}

In [109]:
# Run the query and keep the returned hits.
search_response = client.search(index=my_index, query=query, track_total_hits=True)
hits = search_response['hits']['hits']

# Ids of the returned documents, in response order.
hits_ids = []
for hit in hits:
    hits_ids.append(hit['_id'])

10 most relevant docs for the word "Konstytucja"

In [110]:
# Display the ids of the returned hits, in the order they appeared in the response.
hits_ids

['1997_629.txt',
 '2000_443.txt',
 '1997_604.txt',
 '1996_350.txt',
 '1997_642.txt',
 '2001_23.txt',
 '1996_199.txt',
 '1999_688.txt',
 '2001_1082.txt',
 '1997_681.txt']

In [111]:
import requests

# No explicit headers: requests sets Content-Type itself when the body is passed
# via json= (it would not when passing data=).
headers = {}

# Search body: match "konstytucja" on the "text" field and request highlights
# for that field with default highlight options.
highlight_spec = {'fields': {'text': {}}}
json_data = {
    'query': {'match': {'text': 'konstytucja'}},
    'highlight': highlight_spec,
}



In [112]:
# Run the highlight search over HTTP. The URL is built from `my_index` so it
# always targets the same index as the client calls.
search_url = 'http://localhost:9200/' + my_index + '/_search?pretty'
response = requests.get(search_url, headers=headers, json=json_data)
# Fail loudly on an HTTP error instead of parsing an error payload as search results.
response.raise_for_status()


In [113]:
# Decode the JSON body of the highlight search response.
with_highlights = response.json()


In [114]:
# Map each hit's document id to its list of highlighted fragments of the "text" field.
highlights = {}
for hit in with_highlights['hits']['hits']:
    highlights[hit['_id']] = hit['highlight']['text']

In [115]:
# For every document print its id, a separator line and at most its first
# three highlighted fragments, one item per line.
for doc, hls in highlights.items():
    print(doc, '=============', *hls[:3], sep='\n')

1997_629.txt
KONSTYTUCYJNA z dnia 22 kwietnia 1994 r. o zmianie ustawy konstytucyjnej o trybie przygotowania i uchwalenia <em>Konstytucji</em>
Inicjatywa ustawodawcza w zakresie przedstawienia Zgromadzeniu Narodowemu projektu nowej <em>Konstytucji</em>
Do zgłoszenia projektu <em>Konstytucji</em> załącza się wykaz obywateli popierających zgłoszenie, zawierający
2000_443.txt
Ratyfikacji podlegają umowy międzynarodowe, o których mowa w art. 89 ust. 1 i art. 90 <em>Konstytucji</em> Rzeczypospolitej
umowy międzynarodowej lub załącznika nie wypełnia przesłanek określonych w art. 89 ust. 1 lub art. 90 <em>Konstytucji</em>
okoliczności, a umowa międzynarodowa nie wypełnia przesłanek określonych w art. 89 ust. 1 lub art. 90 <em>Konstytucji</em>
1997_604.txt
W razie powstania wątpliwości co do zgodności z <em>Konstytucją</em> celów lub zasad działania partii politycznej
Jeżeli Trybunał Konstytucyjny wyda orzeczenie o sprzeczności celów partii politycznej z <em>Konstytucją</em>,
Rozdział 5 Postęp