In [1]:
import json

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from job_search_application.minsearch import Index

In [2]:
with open('../data/retrival_evaluation_data.json', 'r') as json_file:
    evaluation_data = json.load(json_file)

In [5]:
with open('../data/job_data.json', 'r') as json_file:
    job_data = json.load(json_file)

In [6]:
index = Index(
    text_fields=[
        "title",
        "company",
        "locations",
        "skills",
        "posted_at",
        "is_remote",
        "snippet_fragments",
        "description"
    ],
    keyword_fields=["id"]
)

index.fit(job_data)

<job_search_application.minsearch.Index at 0x7f22de3419a0>

In [11]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=3
    )

    return results

In [18]:
# test search
query = "I am looking for job as a CRM Lead in Berlin."
search_results = search(query)

for hit in search_results:
    print(hit['id'], hit['title'])

2 CRM Lead (m/w/d)
15 CoE Lead Manufacturing & Quality (m/w/d)
10 Sales Team Leader B2B for Italy - Circular Tech IT Infrastructure (m/w/d)
11 Revenue Strategy Lead Analyst
4 Process Integration Lead


In [9]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in ground_truth:
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
        # print(relevance)
        # print()

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [12]:
evaluate(eval_data, lambda q: search(q['question']))

{'hit_rate': 0.8941176470588236, 'mrr': 0.8637254901960784}

In [25]:
import random

def simple_optimize(param_ranges, objective_function, val_data, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params, val_data)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [26]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=3
    )

    return results

In [27]:
param_ranges = {
    "title": (0.0, 3.0),
    "company": (0.0, 3.0),
    "locations": (0.0, 3.0),
    "skills": (0.0, 3.0),
    "posted_at": (0.0, 3.0),
    "is_remote": (0.0, 3.0),
    "snippet_fragments": (0.0, 3.0),
    "description": (0.0, 3.0),
}

def objective(boost_params, val_data):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(val_data, search_function)
    return results['mrr']

In [31]:
# TODO: random?
validation_data = eval_data[:8*5]
test_data = eval_data[5*8:]

In [29]:
simple_optimize(param_ranges, objective, validation_data, n_iterations=20)

({'title': 1.952132332727948,
  'company': 0.0377436020218489,
  'locations': 0.6504991790099254,
  'skills': 1.1721488115657142,
  'posted_at': 2.920993194746707,
  'is_remote': 0.6723138242171246,
  'snippet_fragments': 2.596607550050458,
  'description': 2.9650809331030823},
 0.95)

In [30]:
def minsearch_improved(query):
    boost = {
      'title': 1.95,
      'company': 0.04,
      'locations': 0.65,
      'skills': 1.17,
      'posted_at': 2.92,
      'is_remote': 0.676,
      'snippet_fragments': 2.6,
      'description': 2.97
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=3
    )

    return results

In [32]:
evaluate(validation_data, lambda q: minsearch_improved(q['question']))

{'hit_rate': 0.95, 'mrr': 0.95}

In [33]:
# TODO: dataset too small
evaluate(test_data, lambda q: minsearch_improved(q['question']))

{'hit_rate': 0.7333333333333333, 'mrr': 0.7}

In [34]:
evaluate(eval_data, lambda q: minsearch_improved(q['question']))

{'hit_rate': 0.8352941176470589, 'mrr': 0.8176470588235294}