In [1]:
from src.search_engine import SearchEngine
from src.model import Model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

In [2]:
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

In [3]:
def get_best_using_sparse(query, candidates, vectorizer):
    tfidf_vectors = vectorizer.transform(candidates).toarray()
    tfidf_query = vectorizer.transform([query]).toarray()
    best_dist = -99999999999
    best_idx = -1 
    for i in range(tfidf_vectors.shape[0]):
        # print(tfidf_query.shape, tfidf_vectors.shape)
        sim = cosine_sim(np.ravel(tfidf_query), np.ravel(tfidf_vectors[i]))
        if sim > best_dist:
            best_dist = sim
            best_idx = i
    return best_idx

In [4]:
preprocessed_data_dir = "data/preprocessed_corpus.json"
embeddings_dir = "data/corpus_embeddings.json"

In [5]:
model = Model("distiluse-base-multilingual-cased-v1")
search = SearchEngine(embeddings_dir, preprocessed_data_dir, model)

In [6]:
known_definitions = [text.split(".")[0] for text in search.text]
len(known_definitions)

100848

In [7]:

vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(search.text)

In [54]:
all_questions = []

# Dataset A
with open("data/task2_questions_with_answers.tsv", encoding="utf-8") as f:
    for line in f.readlines():
        line = line.strip().split("\t")
        question, answer = line[0], line[1:]
        all_questions.append((question, answer))

# Dataset B
# with open("data/test_B_questions.txt", encoding="utf-8") as questions_source:
#     with open("data/test_B_answers.txt", encoding="utf-8") as ans_source:
#         for question, answers in zip(questions_source.readlines(), ans_source.readlines()):
#             answers = answers.strip().split("\t")
#             all_questions.append((question.strip(), answers))

In [55]:
# Dataset A'
filtered_questions = []
for i, question in enumerate(all_questions):
    if i % 5 == 0:
        filtered_questions.append(question)
all_questions = filtered_questions

In [56]:
dense_correct = 0
dense_heur_correct = 0
dense_sparse_correct = 0
dense_sparse_heur_correct = 0
dense_answers = []
dense_heur_answers = []
dense_sparse_answers = []
dense_sparse_heur_answers = []
for query, answers in tqdm(all_questions):
    yes_no_question = query[:4] == "Czy "
    # print("Query: {}, Answers: {}".format(query, answers))
    dense_result = search.run(query)[0]
    result_word = dense_result.split(".")[0]
    dense_correct += int(result_word.lower() in answers)
    dense_answers.append(result_word)
    if yes_no_question:
        heur_answer = "tak"
    else:
        heur_answer = result_word
    dense_heur_correct += int(heur_answer.lower() in answers)
    dense_heur_answers.append(heur_answer)

    # print("Best text using dense rep: \n\t{}".format(dense_result), end="")
    candidates = search.run(query, k_best=20)
    idx_result = get_best_using_sparse(query, candidates, vectorizer)

    # print(idx_result)
    result = candidates[idx_result]
    word_result = result.split(".")[0]
    dense_sparse_correct += int(word_result.lower() in answers)
    dense_sparse_answers.append(word_result)
    # print("Best text using dense and sparse rep:\n\t{}".format(result))
    
    if yes_no_question:
        heur_answer = "tak"
    else:
        heur_answer = word_result
    dense_sparse_heur_correct += int(heur_answer.lower() in answers)
    dense_sparse_heur_answers.append(heur_answer)

100%|██████████| 700/700 [04:56<00:00,  2.36it/s]


In [57]:
n = len(all_questions)

In [58]:
print("DENSE:\n Correct answers: {} ({:.2f}%)".format(dense_correct, dense_correct * 100/ n))
print("DENSE + SPARSE:\n Correct answers: {} ({:.2f}%)".format(dense_sparse_correct, dense_sparse_correct * 100/ n))
print("DENSE + HEUR:\n Correct answers: {} ({:.2f}%)".format(dense_heur_correct, dense_heur_correct * 100/ n))
print("DENSE + SPARSE + HEUR:\n Correct answers: {} ({:.2f}%)".format(dense_sparse_heur_correct, dense_sparse_heur_correct * 100/ n))

DENSE:
 Correct answers: 21 (3.00%)
DENSE + SPARSE:
 Correct answers: 23 (3.29%)
DENSE + HEUR:
 Correct answers: 57 (8.14%)
DENSE + SPARSE + HEUR:
 Correct answers: 59 (8.43%)


In [60]:

with open("answers_A_prim.txt", "w+", encoding="utf-8") as f:
    for answer in dense_sparse_heur_answers:
        f.write(answer + '\n')