In [1]:
import re
from collections import defaultdict
from itertools import product

import morfeusz2 as morfeusz2
import numpy as np
from tqdm import tqdm

from List_4.utils import prepare_question, scaled_editdist, score_documents

In [2]:
with open("../List_2/outputs/indexed_contents.txt", "r") as f:
    content_lines = f.readlines()

with open("../List_2/outputs/indexed_titles.txt", "r") as f:
    titles_lines = f.readlines()

morph = morfeusz2.Morfeusz()

In [3]:
def get_definition_from_wiki(content):
    i = 1
    sents = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', content)
    definition = str(sents[0])
    while len(definition) < 20 and len(sents) > i + 1:
        i+=1
        definition += str(sents[i])
    return definition

def get_definition_from_question(question):
    definition = []
    beginnings = ["Co oznacza", "Co po polsku oznacza", "Jak miał na imię", "Jak miała na imię", "Jak nazywa się", "Jak w", "Jak z", "Jak brzmi"]
    for beginning in beginnings:
        l = len(beginning)
        if question[:l] == beginning:
            definition = question[l:]
            break
    if len(definition) > 0:
        definition = definition.split()
        definition = [w.lower() for w in definition if not ("oznacza" in w or "nazywa" in w)]
        definition = " ".join(definition)
    return definition

def preprocess_definition(definition, only_first = True):
    definition = definition.strip().strip('"').split()
    definition = [w for w in definition if len(w) > 2 and w.isalpha()]
    current_idx = 0
    current_def = []
    analysis = morph.analyse(" ".join(definition))
    if only_first:
        for i, j, interp in analysis:
            if i == current_idx:
                current_def.append(interp[1].partition(":")[0])
                current_idx += 1
    else:
        for i, j, interp in analysis:
            current_def.append(interp[1].partition(":")[0])
    return " ".join(definition), current_def

def prepare_rare_words(prepared_definitions, min, max):
    word_freq = defaultdict(int)
    for definition in tqdm(prepared_definitions):
        for word in definition[1]:
            word_freq[word] +=1
    rare_words = set([k for k, v in word_freq.items() if min < v < max])
    rare_words_freq = {k:v for k, v in word_freq.items() if min < v < max}
    return rare_words, rare_words_freq

def prepare_idf(definition, rare_words):
    l = len(definition)
    definition_rare_words = defaultdict(int)
    for word in definition:
        if word in rare_words:
            definition_rare_words[word] += 1
    definition_rare_words = {k:(v/l) for k, v in definition_rare_words.items()}
    return definition_rare_words

def similarity(d1, d2):
    score = 0
    norm1, norm2 = 0, 0
    for w, idf in d1.items():
        if w in d2.keys():
            score += d1[w]*d2[w]
        norm1 += idf**2
    for w, idf in d2.items():
        norm2 += idf**2
    return score/(np.sqrt(norm1) * np.sqrt(norm2) + 1e-10)

def find_similar_definitions(definition, definition_features, rare_words):
    scores = {}
    target_features = prepare_idf(definition, rare_words)
    max_words_matched = 1
    for i, features in enumerate(definition_features):
        words_matched = len(set(target_features.keys()) & set(features.keys()))
        if words_matched >= max_words_matched:
            max_words_matched = words_matched
            match = similarity(target_features, features)
            scores[i] = match
    return dict(sorted(scores.items(), key=lambda item: -item[1]))

def answer(definition, titles_lines, titles, definition_features, rare_words,):
    definition = preprocess_definition(definition, only_first=False)[1]
    candidates = list(find_similar_definitions(definition, definition_features, rare_words).items())[:30]
    for idx, score in candidates:
        title = titles_lines[titles[idx]].lower()
        _, res_tokens = prepare_question(title, morph)
        for t1, t2 in product(res_tokens, definition):
            if scaled_editdist(t1, t2) <= 0.4:
                break
        else:
            paren_index = title.find('(')
            if paren_index != -1:
                title = title[:paren_index]
            return title
    return "Nie znam odpowiedzi"

In [4]:
prepared_definitions = []
prepared_titles = []
for i in tqdm(range(len(content_lines))):
    current_def = preprocess_definition(get_definition_from_wiki(content_lines[i]))
    if len(current_def[1]) > 3:
        prepared_definitions.append(current_def)
        prepared_titles.append(i)

100%|██████████| 1208362/1208362 [03:55<00:00, 5128.64it/s]


In [5]:
rare_words, rare_words_freq = prepare_rare_words(prepared_definitions, 1, 5000)

100%|██████████| 1133577/1133577 [00:03<00:00, 334582.36it/s]


In [6]:
definition_features = []
for definition in tqdm(prepared_definitions):
    definition_features.append(prepare_idf(definition[1], rare_words))

100%|██████████| 1133577/1133577 [00:04<00:00, 267551.34it/s]


In [7]:
%%time
scores = find_similar_definitions(preprocess_definition("szybki obrót wykonywany na palcach jednej nogi")[1], definition_features, rare_words)
for i, title in enumerate(scores.keys()):
    print(titles_lines[prepared_titles[title]].strip())
    if i > 5:
        break

Szybki Szmal
Zespół bolesnych nóg i ruchów palców
Prędkość obrotowa
Wahacz
Victoria (gest)
Brzuszki
Piruet
CPU times: user 1.05 s, sys: 4.01 ms, total: 1.05 s
Wall time: 1.05 s


In [8]:
answer("Jak nazywa się wypukła albo wklęsła powierzchnia cieczy w pobliżu ścianek naczynia?", titles_lines,  prepared_titles, definition_features, rare_words)

'menisk \n'

In [25]:
# 21/189
with open('../List_2/data/pytania.txt', 'r') as questions:
    with open('t2_2_odpowiedzi_tylko_wektory.txt', 'w') as f_out:
        line = questions.readline()
        while line:
            definition = get_definition_from_question(line.strip()[:-1])
            if len(definition) > 0:
                a = answer(definition, titles_lines,  prepared_titles, definition_features, rare_words)
                f_out.write(a.strip()+' \n')
            else:
                f_out.write("Nie o definicję" +' \n')
            line = questions.readline()

In [23]:
def answer_mod(question, weights, threshold, titles, titles_lines, rare_words, definition_features):
    yes_or_no = question.split()[0] == "Czy"
    definition = get_definition_from_question(question)
    if len(definition) > 0:
        candidates = list(find_similar_definitions(preprocess_definition(definition, only_first=False)[1], definition_features, rare_words).items())[:100]
    else:
        candidates = []
    candidates = [titles[idx[0]] for idx in candidates]
    _, question = prepare_question(question, morph)
    question = [token.lower() for token in question if len(token) > 1]
    while question:
        query = ' '.join(q for q in question)
        results = score_documents(query, weights, morph, candidates)
        search_results = list(results.keys())[:10]
        search_scores = list(results.values())[:10]
        search_results = zip(search_results, search_scores)
        for result, score in search_results:
            if yes_or_no:
                if score / len(question) < threshold:
                    return "nie"
                else:
                    return "tak"
            title = titles_lines[result]
            _, res_tokens = prepare_question(title, morph)
            for t1, t2 in product(res_tokens, question):
                if scaled_editdist(t1, t2) <= 0.3:
                    break
            else:
                paren_index = title.find('(')
                if paren_index != -1:
                    title = title[:paren_index]
                return title
        # if answer not found, remove first token of query
        del question[0]
    return 'nie mam pojęcia, sorry \n'

In [24]:
# 82/1000
with open('../List_2/data/pytania.txt', 'r') as questions:
    with open('t2_2_odpowiedzi_mieszane1.txt', 'w') as f_out:
        line = questions.readline()
        while line:
            a = answer_mod(line.strip()[:-1], weights=[1, 1, 0.3, 0.3, 2], threshold=0.5, titles_lines=titles_lines,  titles=prepared_titles, definition_features=definition_features, rare_words=rare_words)
            print(a.strip())
            f_out.write(a.strip()+' \n')
            line = questions.readline()

Alfa
Cięciwa
Nelly Rawlison
tak
Śmierć Kliniczna
Trynidad i Tobago
Tomasz Czereśniak
W pustyni i w puszczy
tak
The Cellar Door Sessions 1970
Władca zwierząt
Reduktor
XXXIX Liceum Ogólnokształcące im. Lotnictwa Polskiego w Warszawie
Dębowa Kłoda
Dekantacja
David Brewster
Isztadewata
Nawanialnia
tak
AWK
Początek
Skąd przyszliśmy? Kim jesteśmy? Dokąd idziemy?
Giovanni Bertuccio
Feblik
Równoleżnik
Browar Karscha
Distributed.net
Ligia
Pomnik Żołnierza Polskiego w Grudziądzu
Epiderma
Baracoa
James Bond
Zabytki w Pabianicach
nie mam pojęcia, sorry
Dzielny pies Rusty
tak
Piotr Kąkolewski
Lucio Wagner
nie mam pojęcia, sorry
Staw Staszica
Tragedia Makbeta
Góry Kii
nie mam pojęcia, sorry
Romanioci
tak
Krtań
Krótki film o miłości
nie mam pojęcia, sorry
Gnieciuch
Efekt okopowy
nie mam pojęcia, sorry
Wykiwać klawisza
tak
Powiat Ostvorpommern
Amerykanie pochodzenia wenezuelskiego
Kazimierz Badowski
Eudoksja
Barbie jako księżniczka i żebraczka
Izokrates
tak
Tonaż
tak
Alfred Stark
Coco Island
nie mam p

In [None]:
# https://github.com/sdadas/polish-nlp-resources