In [1]:
import json
import numpy as np

# Read the labelled samples: a JSON object loaded into the `samples` dict.
with open('samples.json', mode='r', encoding='utf-8') as f:
    samples = json.loads(f.read())

In [2]:
# Show the keys of the loaded samples; each key is used as a search query below.
samples.keys()

dict_keys(['нагрузочное тестирование', 'мотивация', 'редактор кода', 'конференция для разработчиков', 'олимпиада по программированию', 'методы кластеризации', 'удалённая работа'])

Use the dumped (pre-built) search here, because multiprocessing requires an `if __name__ == '__main__':` block to work correctly.

In [3]:
import search
import logging

# Raise the 'search' logger threshold to WARNING before building the search
# (presumably to keep lower-level log output out of the notebook - confirm).
search_logger = logging.getLogger('search')
search_logger.setLevel(logging.WARNING)

# Initialise the search module, then keep a module-level handle to its scorer;
# the evaluation functions below assign weights on this object and call it.
search.build_search()
scorer = search.scorer

In [4]:
def apk(actual: np.array, predicted: np.array, k: int = 10) -> float:
    """Compute average precision at ``k`` for a single ranked result list.

    For every position ``i`` (0-based) among the first ``k`` predictions that
    holds a relevant item, the precision at that position
    (``hits so far / (i + 1)``) is accumulated. The accumulated sum is divided
    by ``k``.

    Args:
        actual: Collection of relevant item ids. It is converted to a set, so
            the items must be hashable.
        predicted: Ranked sequence of predicted item ids, best first. It must
            support ``len()`` and integer indexing.
        k: Cut-off rank. Only the first ``k`` predictions are inspected.

    Returns:
        The average precision at ``k``. A result list shorter than ``k`` is
        scored as if the missing positions were misses.

    Raises:
        ZeroDivisionError: If ``k`` is 0.
    """
    relevant = set(actual)
    # The search may return fewer than k documents; never index past the end.
    limit = min(k, len(predicted))
    pk_sum = 0.0
    hits = 0
    for i in range(limit):
        if predicted[i] in relevant:
            hits += 1
            pk_sum += hits / (i + 1)
    # Normalise by k (not by the number of relevant items).
    return pk_sum / k

In [5]:
from typing import Dict, Tuple


def query_score(query: str, weights: Dict[str, float]) -> float:
    """Evaluate one query under the given scorer weights.

    Assigns ``weights`` to the module-level ``scorer``, retrieves candidate
    article indices for ``query``, scores each candidate, ranks the article
    ids by descending score and returns ``apk`` at 10 against the labelled
    ids stored in ``samples[query]``.

    Note that this mutates ``scorer.score_weights``.
    """
    # Presumably scorer.score reads these weights - assign them before scoring.
    scorer.score_weights = weights

    ranked = []
    for ind in search.retrieve_indices(query):
        ranked.append((search.articles[ind].id, scorer.score(query, ind)))
    # Negated key => highest score first; ties keep retrieval order.
    ranked.sort(key=lambda pair: -pair[1])

    predicted = [article_id for article_id, _ in ranked]
    return apk(samples[query], predicted, 10)


def total_score(weights: Dict[str, float]) -> float:
    """Return the mean ``query_score`` over every query key in ``samples``."""
    per_query = [query_score(query, weights) for query in samples]
    return sum(per_query) / len(samples)

In [6]:
from itertools import product


def weight_list_to_dict(weight_list: Tuple[float]) -> Dict[str, float]:
    """Map a positional weight sequence onto the named scorer weights.

    Positions 0..3 are assigned, in order, to ``word2vec_text``,
    ``tfidf_text``, ``word2vec_title`` and ``tfidf_title``. Any further
    elements are ignored; a shorter sequence raises ``IndexError``.
    """
    names = ('word2vec_text', 'tfidf_text', 'word2vec_title', 'tfidf_title')
    return {name: weight_list[pos] for pos, name in enumerate(names)}


# Coarse grid search: try every 4-way combination of the values produced by
# np.arange(0.1, 1, 0.3) and keep the weights with the highest mean score.
best_weights, best_score = {}, 0
coarse_values = np.arange(0.1, 1, 0.3)
for weight_list in product(coarse_values, repeat=4):
    weights = weight_list_to_dict(weight_list)
    score = total_score(weights)
    # Strict '>' keeps the first combination seen when scores tie.
    if score > best_score:
        best_score, best_weights = score, weights

In [7]:
# Display the best weights and mean score found by the grid search above.
best_weights, best_score

({'word2vec_text': 0.1,
  'tfidf_text': 0.7000000000000001,
  'word2vec_title': 0.1,
  'tfidf_title': 0.4},
 0.5046201814058958)

In [8]:
# Refined grid search: hand-picked candidate values per weight
# (3 * 4 * 3 * 3 = 108 combinations); keep the highest mean score.
best_weights, best_score = {}, 0
candidate_grid = (
    [0.05, 0.1, 0.2],      # word2vec_text
    [0.5, 0.7, 0.8, 0.9],  # tfidf_text
    [0.01, 0.05, 0.1],     # word2vec_title
    [0.3, 0.4, 0.5],       # tfidf_title
)
for weight_list in product(*candidate_grid):
    weights = weight_list_to_dict(weight_list)
    score = total_score(weights)
    # Strict '>' keeps the first combination seen when scores tie.
    if score > best_score:
        best_score, best_weights = score, weights

In [9]:
# Display the best weights and mean score found by the refined grid search.
best_weights, best_score

({'word2vec_text': 0.1,
  'tfidf_text': 0.9,
  'word2vec_title': 0.05,
  'tfidf_title': 0.3},
 0.508015873015873)