# NLP - lab2

### Mateusz Praski

---


In [1]:
import requests
import json
import sys

import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix
from glob import glob
from tqdm import trange, tqdm
from sklearn.metrics import ndcg_score

# Task 3 & 4
Define an ES analyzer for Polish texts containing:
- standard tokenizer
- synonym filter with alternative forms for months, e.g. wrzesień, wrz, IX.
- lowercase filter
- Morfologik-based lemmatizer
- lowercase filter (looks strange, but Morfologi produces capitalized base forms for proper names, so we have to lowercase them once more).

Define another analyzer for Polish, without the synonym filter.

In [2]:
analyzer_with_synonyms = {
    'type': 'custom',
    'tokenizer': 'standard',
    'filter': [
        'months-synonyms',
        'lowercase',
        'morfologik_stem',
        'lowercase'
    ]
}

analyzer_without_synonyms = {
    'type': 'custom',
    'tokenizer': 'standard',
    'filter': [
        'lowercase',
        'morfologik_stem',
        'lowercase'
    ]
}

no_lemma_synonyms = {
    'type': 'custom',
    'tokenizer': 'standard',
    'filter': [
        'months-synonyms',
        'lowercase',
    ]
}

no_lemma_no_synonyms = {
    'type': 'custom',
    'tokenizer': 'standard',
    'filter': [
        'lowercase',
    ]
}

filters = {
    'months-synonyms': {
        'type': 'synonym',
        'synonyms': [
            'sty, I => styczeń',
            'lut, II => luty',
            'mar, III => marzec',
            'kwi, IV => kwiecień',
            'V => maj',
            'cze, VI => czerwiec',
            'lip, VII => lipca',
            'sie, VIII => sierpnia',
            'wrz, IX => wrzesień',
            'paz, X => pażdziernik',
            'lis, XI => listopad',
            'gru, XII => grudzień'
        ]
    }
}

# Task 5

Define an ES index for storing the contents of the corpus from lab 1 using both analyzers. Use different names for the fields analyzed with a different pipeline.

In [3]:
index_definition = {
    'mappings': {
        'properties': {
            'answer': {
                'type': 'text',
                'fields': {
                    'with_synonyms': {
                        'type': 'text',
                        'analyzer': 'analyze_with_synonyms'
                    },
                    'without_synonyms': {
                        'type': 'text',
                        'analyzer': 'analyze_without_synonyms'
                    }
                }
            },
        }
    },
    'settings': {
        'analysis': {
            'analyzer': {
                'analyze_with_synonyms': analyzer_with_synonyms,
                'analyze_without_synonyms': analyzer_without_synonyms,
                'analyze_no_lemma_synonyms': no_lemma_synonyms,
                'analyze_no_lemma_no_synonyms': no_lemma_no_synonyms
            },
            'filter': filters
        }
    }
}

In [4]:
body = json.dumps(index_definition, indent=4)
print(body)

{
    "mappings": {
        "properties": {
            "answer": {
                "type": "text",
                "fields": {
                    "with_synonyms": {
                        "type": "text",
                        "analyzer": "analyze_with_synonyms"
                    },
                    "without_synonyms": {
                        "type": "text",
                        "analyzer": "analyze_without_synonyms"
                    }
                }
            }
        }
    },
    "settings": {
        "analysis": {
            "analyzer": {
                "analyze_with_synonyms": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "months-synonyms",
                        "lowercase",
                        "morfologik_stem",
                        "lowercase"
                    ]
                },
                "analyze_without_synonyms": {
                    "typ

In [5]:
!docker cp elastic_container:/usr/share/elasticsearch/config/certs/http_ca.crt .

[H[2J[?25l[?7l[0m[31m[1m                     ./[0m[35m[1mo[0m[34m[1m.
[0m[31m[1m                   ./[0m[35m[1msssso[0m[34m[1m-
[0m[31m[1m                 `:[0m[35m[1mosssssss+[0m[34m[1m-
[0m[31m[1m               `:+[0m[35m[1msssssssssso[0m[34m[1m/.
[0m[31m[1m             `-/o[0m[35m[1mssssssssssssso[0m[34m[1m/.
[0m[31m[1m           `-/+[0m[35m[1msssssssssssssssso[0m[34m[1m+:`
[0m[31m[1m         `-:/+[0m[35m[1msssssssssssssssssso[0m[34m[1m+/.
[0m[31m[1m       `.://o[0m[35m[1msssssssssssssssssssso[0m[34m[1m++-
[0m[31m[1m      .://+[0m[35m[1mssssssssssssssssssssssso[0m[34m[1m++:
[0m[31m[1m    .:///o[0m[35m[1mssssssssssssssssssssssssso[0m[34m[1m++:
[0m[31m[1m  `:////[0m[35m[1mssssssssssssssssssssssssssso[0m[34m[1m+++.
[0m[31m[1m`-////+[0m[35m[1mssssssssssssssssssssssssssso[0m[34m[1m++++-
[0m[31m[1m `..-+[0m[35m[1moosssssssssssssssssssssssso[0m[34m[1m+++++/`

In [2]:
ELASTIC_IP = 'https://localhost:9200'
INDEX = 'nlp-index'
auth = ('elastic', '"qwerty"')
cert = 'http_ca.crt'

In [7]:
res = requests.delete(f'{ELASTIC_IP}/{INDEX}', auth=auth, verify=cert)
res.status_code, print(res.content.decode())

{"acknowledged":true}


(200, None)

In [8]:
res = requests.put(f'{ELASTIC_IP}/{INDEX}', json=index_definition, auth=auth, verify=cert)
res.status_code, print(res.content.decode())

{"acknowledged":true,"shards_acknowledged":true,"index":"nlp-index"}


(200, None)

In [5]:
df = pd.read_json("../../data/corpus.jsonl", lines=True)
df = df.set_index('_id').sort_index()
df.head()

Unnamed: 0_level_0,title,text,metadata
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,,"Nie mówię, że nie podoba mi się też pomysł szk...",{}
31,,Tak więc nic nie zapobiega fałszywym ocenom po...,{}
56,,Nigdy nie możesz korzystać z FSA dla indywidua...,{}
59,,Samsung stworzył LCD i inne technologie płaski...,{}
63,,Oto wymagania SEC: Federalne przepisy dotycząc...,{}


# Task 6

Load the data to the ES index.

In [10]:
def bulk_insert(rows, col):
    n = len(rows)
    request_command = [f'{{"index": {{ "_index": "{INDEX}" }} }}'] * n
    request_data = [f'{{"with_synonyms": {text}, "without_synonyms": {text}}}' for text in rows]

    payload = [None] * (n * 2)
    payload[::2] = request_command
    payload[1::2] = request_data
    body = "\n".join(payload) + '\n'
    print(body)
    # Didn't work  in the end :(
    res = requests.post(f'{ELASTIC_IP}/_bulk?pretty', data=body, headers={'Content-type': 'application/x-ndjson'}, auth=auth, verify=cert)
    return res

In [24]:
def standard_insert(df, disable_tqdm=False):
    for index, row in tqdm(df.iterrows(), disable=disable_tqdm, total=len(df.index)):
        rs = requests.put(f"{ELASTIC_IP}/{INDEX}/_doc/{index}", json={"answer": row['text']}, auth=auth, verify=cert)
        if rs.status_code != 201 and rs.status_code != 200:
            raise RuntimeError(f"{rs.status_code} - {rs.text}")

In [25]:
# res = requests.post(f"{ELASTIC_IP}/{INDEX}/_doc", json={"text": text}, auth=auth, verify=cert)

In [26]:
# res.status_code

In [27]:
standard_insert(df)

100%|██████████| 57638/57638 [12:39<00:00, 75.94it/s]


In [28]:
requests.get(f"{ELASTIC_IP}/{INDEX}/_count", auth=auth, verify=cert).text

'{"count":57638,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0}}'

In [32]:
res = requests.get(f"{ELASTIC_IP}/{INDEX}/_search?pretty&size=5", auth=auth, verify=cert, json={
    "query": {
        "multi_match": {
            "query": df.iloc[0]['text'],
            "fields": [
                "answer.with_synonyms"
            ]
        }
    }
})

In [3]:
def query_text(text, analyzer=None, limit=None, use_synonyms=True):
    url = f"{ELASTIC_IP}/{INDEX}/_search"
    if limit is not None:
        url += f"?size={limit}"

    index = "answer.with_synonyms" if use_synonyms else "answer.without_synonyms"

    body  = {
        "query": {
            "match": {
                index: {
                    "query": text,
                }
            }
        }
    }

    if analyzer is not None:
        body["query"]["match"][index]["analyzer"]  = analyzer

    rs = requests.get(url, json=body, auth=auth, verify=cert)
    if rs.status_code != 200:
        raise RuntimeError(f"{rs.status_code} - {rs.text}")
    return json.loads(rs.text)

# Task 7

Determine the number of documents containing the word `styczeń` (in any form) including and excluding the synonyms.

## With synonyms

In [7]:
query_text("styczeń")['hits']['total']['value']

3101

## Without synonyms

In [8]:
query_text("styczeń", use_synonyms=False)['hits']['total']['value']

329

It's worth mentioning that synonym for `styczeń` is  `I`, which may occur in some other texts

In [9]:
query_text("styczeń")['hits']['hits'][0]

{'_index': 'nlp-index',
 '_id': '71552',
 '_score': 5.8238015,
 '_source': {'answer': 'Niech P oznacza kwotę inwestycji, R stopę zwrotu, a I stopę inflacji. Dla uproszczenia załóżmy, że płatność p jest dokonywana corocznie zaraz po uzyskaniu zwrotu. Tak więc, na koniec roku, inwestycja P wzrosła do P*(1+R), a p jest zwracane jako wypłata renty. Jeżeli I = 0, cały zwrot może zostać wypłacony jako zapłata, a więc p = P*R. Oznacza to, że pod koniec roku, gdy kurz opadnie po odebraniu zwrotu P*R i wypłaceniu go jako renty dożywotniej, P jest ponownie dostępne na początku następnego roku, aby zarobić zwrot według stawki R. My mieć P*(1+R) - p = P Jeżeli I > 0, to na koniec roku, po opadnięciu kurzu, nie możemy sobie pozwolić na posiadanie tylko P jako inwestycji na przyszły rok. Przyszłoroczna opłata musi wynosić p*(1+I), więc potrzebujemy większej inwestycji, ponieważ stopa zwrotu jest stała. O ile większy? Cóż, jeśli inwestycja na początku przyszłego roku wyniesie P*(1+I), zarobi dokładni

# Task 9
Compute NDCG@5 for the QA dataset (the test subset) for the following setups
- synonyms enabled and disabled,
- lemmatization in the query enabled and disabled.

In [4]:
questions = pd.read_json('../../data/queries.jsonl', lines=True)
questions.head()

Unnamed: 0,_id,text,metadata
0,0,Co jest uważane za wydatek służbowy w podróży ...,{}
1,4,Wydatki służbowe - ubezpieczenie samochodu pod...,{}
2,5,Rozpoczęcie nowego biznesu online,{}
3,6,„Dzień roboczy” i „termin płatności” rachunków,{}
4,7,Nowy właściciel firmy – Jak działają podatki d...,{}


In [6]:
qa = pd.concat([
    pd.read_csv(path, sep='\t')
    for path in glob('../../data/*.tsv')
])
qa = qa.sort_values(by='query-id')
qa.head()

Unnamed: 0,query-id,corpus-id,score
0,0,18850,1
0,1,14255,1
1,2,308938,1
2,3,296717,1
3,3,100764,1


In [7]:
qa['score'].nunique()

1

In [8]:
qa_mapping = csr_matrix(
    (qa['score'], (qa['query-id'], qa['corpus-id'])),
    shape=(qa['query-id'].max() + 1, df.index.max() + 1),
    dtype=int
)

In [9]:
qa_mapping[1, 2]

0

In [10]:
max_matches = qa.groupby('query-id')['corpus-id'].count().rename('count')
max_matches

query-id
0        1
1        1
2        1
3        4
4        1
        ..
11092    6
11096    4
11097    1
11099    2
11104    1
Name: count, Length: 6648, dtype: int64

In [11]:
def eval_answers(questions, analyzer, use_syonynms):
    no_questions = len(questions.index)
    rec = np.empty((no_questions, 5), dtype=int)

    for index, row in tqdm(questions.iterrows(), total=no_questions):
        rs = query_text(row['text'], analyzer=analyzer, limit=5, use_synonyms=use_syonynms)

        recs = [qa_mapping[int(row['_id']), int(rs['_id'])] for rs in rs['hits']['hits'][:5]]
        if len(recs) < 5:
            recs += [-1] * (5 - len(recs))

        rec[index] = recs
    return rec

In [12]:
rec_lemma_synonyms = eval_answers(questions, 'analyze_with_synonyms', use_syonynms=True)

100%|██████████| 6648/6648 [01:55<00:00, 57.67it/s]


In [13]:
rec_lemma_no_synonyms = eval_answers(questions, 'analyze_without_synonyms', use_syonynms=False)

100%|██████████| 6648/6648 [01:56<00:00, 57.08it/s]


In [14]:
rec_no_lemma_synonyms = eval_answers(questions, 'analyze_no_lemma_synonyms', use_syonynms=True)

100%|██████████| 6648/6648 [01:37<00:00, 68.12it/s]


In [15]:
rec_no_lemma_no_synonyms = eval_answers(questions, 'analyze_no_lemma_no_synonyms', use_syonynms=False)

100%|██████████| 6648/6648 [01:34<00:00, 70.15it/s]


In [16]:
perfect_answers = np.zeros((len(questions.index), 5), dtype=int)

for index, row in tqdm(questions.iterrows()):
    matches = min(max_matches.loc[row['_id']], 5)
    vector = ([1] * matches) + ([0] * (5 - matches))
    perfect_answers[index, :] = vector

6648it [00:00, 30445.81it/s]


In [18]:
np.savez(
    'rs/scores.npz',
    no_lemma_no_synonyms=rec_no_lemma_no_synonyms,
    no_lemma_synonyms=rec_no_lemma_synonyms,
    lemma_no_synonyms=rec_lemma_no_synonyms,
    lemma_synonyms=rec_lemma_synonyms
)

In [22]:
pd.DataFrame(data={
    "lemmatization": [
        ndcg_score(perfect_answers, rec_lemma_synonyms),
        ndcg_score(perfect_answers, rec_lemma_no_synonyms)
    ],
    "no_lemmatization": [
        ndcg_score(perfect_answers, rec_no_lemma_synonyms),
        ndcg_score(perfect_answers, rec_no_lemma_no_synonyms)
    ]},
    index=["synonyms", "no_synonyms"]
).applymap(lambda x: f"{x:.2%}")

Unnamed: 0,lemmatization,no_lemmatization
synonyms,77.08%,75.34%
no_synonyms,77.09%,75.34%
