In [5]:
import json

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from job_search_application.minsearch import Index

In [12]:
with open('../data/retrival_evaluation_data.json', 'r') as json_file:
    eval_data = json.load(json_file)

In [13]:
len(eval_data)

830

In [14]:
with open('../data/job_data.json', 'r') as json_file:
    job_data = json.load(json_file)

In [15]:
len(job_data)

167

In [16]:
index = Index(
    text_fields=[
        "title",
        "company",
        "locations",
        "skills",
        "posted_at",
        "is_remote",
        "snippet_fragments",
        "description"
    ],
    keyword_fields=["id"]
)

index.fit(job_data)

<job_search_application.minsearch.Index at 0x7fc9a7fae090>

In [17]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=3
    )

    return results

In [18]:
# test search
query = "I am looking for job as a CRM Lead in Berlin."
search_results = search(query)

for hit in search_results:
    print(hit['id'], hit['title'])

2 CRM Lead (m/w/d)
141 Senior/Expert Data Privacy Engineer (f/m/d): Data Privacy-Preserving AI
81 Senior/Expert Machine Learning Engineer (f/m/d): Data Privacy-Preserving AI


In [19]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in ground_truth:
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [20]:
evaluate(eval_data, lambda q: search(q['question']))

{'hit_rate': 0.6614457831325301, 'mrr': 0.5740963855421684}

In [21]:
import random

def simple_optimize(param_ranges, objective_function, val_data, n_iterations=5):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params, val_data)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [22]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=3
    )

    return results

In [23]:
param_ranges = {
    "title": (0.0, 3.0),
    "company": (0.0, 3.0),
    "locations": (0.0, 3.0),
    "skills": (0.0, 3.0),
    "posted_at": (0.0, 3.0),
    "is_remote": (0.0, 3.0),
    "snippet_fragments": (0.0, 3.0),
    "description": (0.0, 3.0),
}

def objective(boost_params, val_data):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(val_data, search_function)
    return results['mrr']

In [24]:
# TODO: random?
validation_data = eval_data[:80*5]
test_data = eval_data[5*80:]

In [25]:
simple_optimize(param_ranges, objective, validation_data, n_iterations=20)

({'title': 1.2381302316564724,
  'company': 1.1893658795325162,
  'locations': 2.912048276332455,
  'skills': 1.0518698887822846,
  'posted_at': 2.08800533465675,
  'is_remote': 0.8273411296050296,
  'snippet_fragments': 0.5565008057082038,
  'description': 2.197036708073208},
 0.6758333333333337)

In [26]:
def minsearch_improved(query):
    boost = {
      'title': 1.24,
      'company': 1.2,
      'locations': 2.91,
      'skills': 1.05,
      'posted_at': 2.09,
      'is_remote': 0.83,
      'snippet_fragments': 0.56,
      'description': 2.18
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=3
    )

    return results

In [27]:
evaluate(validation_data, lambda q: minsearch_improved(q['question']))

{'hit_rate': 0.76, 'mrr': 0.6762500000000002}

In [28]:
# TODO: dataset too small
evaluate(test_data, lambda q: minsearch_improved(q['question']))

{'hit_rate': 0.586046511627907, 'mrr': 0.4934108527131784}

In [29]:
evaluate(eval_data, lambda q: minsearch_improved(q['question']))

{'hit_rate': 0.6698795180722892, 'mrr': 0.5815261044176704}