In [1]:
!pip install sentence-transformers faiss-cpu openai




In [1]:
import pickle
import re

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer


# LOAD FAISS + METADATA + MODEL
# The module-level names bound here (index, df, model) are read by
# semantic_search() defined further down in this cell.

print("Loading FAISS index...")
# Path is relative to the kernel's current working directory.
index = faiss.read_index("shl_faiss.index")

print("Loading metadata...")
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load a metadata file that comes from a trusted source.
with open("shl_metadata.pkl", "rb") as f:
    meta = pickle.load(f)

# meta is indexed by the keys "df" and "model_name".
# Presumably df is a pandas DataFrame whose row order matches the FAISS index
# (semantic_search looks rows up with df.iloc) — TODO confirm.
df = meta["df"]
model_name = meta["model_name"]
# Instantiate the encoder named in the metadata; presumably the same model
# that produced the indexed embeddings — confirm against the index builder.
model = SentenceTransformer(model_name)

print("Loaded model:", model_name)

# Get top-k results from FAISS
def semantic_search(query, top_k=20):
    """Return the catalogue rows nearest to ``query`` in the FAISS index.

    Args:
        query: Free-text query; encoded with the module-level ``model`` using
            normalised embeddings.
        top_k: Number of neighbours requested from the module-level ``index``.

    Returns:
        A list of dicts, one per hit, built from the matching rows of the
        module-level ``df`` in the order FAISS returned them. The list may be
        shorter than ``top_k`` when the index yields fewer neighbours.
    """
    query_emb = model.encode([query], normalize_embeddings=True)
    _, indices = index.search(query_emb, top_k)

    # FAISS pads the id array with -1 when it finds fewer than top_k
    # neighbours. df.iloc[-1] would silently return the *last* row as a bogus
    # hit, so those padding slots are skipped.
    return [df.iloc[int(idx)].to_dict() for idx in indices[0] if idx >= 0]



#  LLM Re-ranking 

def _parse_ranked_indexes(raw_text, n_items):
    """Turn the LLM reply into a complete, valid ordering of ``range(n_items)``.

    Every integer found in ``raw_text`` is considered, so replies such as
    ``"3,0,1"``, ``"[3, 0, 1]"`` or one index per line are all accepted.
    Out-of-range and repeated indexes are dropped; indexes the reply omitted
    are appended afterwards in their original order.

    Raises:
        ValueError: if the reply contains no usable index at all.
    """
    ranking = []
    seen = set()
    for token in re.findall(r"-?\d+", raw_text or ""):
        position = int(token)
        if 0 <= position < n_items and position not in seen:
            seen.add(position)
            ranking.append(position)

    if not ranking:
        raise ValueError(f"no usable indexes in LLM reply: {raw_text!r}")

    # Keep candidates the model forgot to mention, in retrieval order.
    ranking.extend(i for i in range(n_items) if i not in seen)
    return ranking


def rerank_with_llm(query, retrieved_items, top_k=10):
    """Re-order retrieved assessments by relevance using an OpenAI chat model.

    Args:
        query: The user's free-text query.
        retrieved_items: List of row dicts to rank. Name and description are
            read from the "Assessment Name" / "Description" keys (the same
            keys ``format_output`` reads), falling back to "name" /
            "description".
        top_k: Maximum number of items to return.

    Returns:
        Up to ``top_k`` items from ``retrieved_items`` in the model's order.
        On any failure (missing ``openai`` package, missing API key, API
        error, unparseable reply) a message is printed and the first
        ``top_k`` items are returned in their original order.
    """
    if not retrieved_items:
        # Nothing to rank; avoid a pointless API call.
        return []

    try:
        # Imported inside the try so a missing openai package falls back to
        # the raw retrieval order instead of breaking the pipeline.
        from openai import OpenAI
        client = OpenAI()

        def field(item, catalogue_key, fallback_key):
            return item.get(catalogue_key, item.get(fallback_key, ""))

        context = "\n\n".join(
            f"[{i}] {field(item, 'Assessment Name', 'name')} — "
            f"{field(item, 'Description', 'description')}"
            for i, item in enumerate(retrieved_items)
        )

        prompt = f"""
        You are a ranking model for SHL assessment recommendations.

        Query:
        {query}

        Below is a list of candidate assessments. Rank them from most relevant to least relevant.

        {context}

        Return ONLY a list of ranked indexes (like: 3,0,1,2,...)
        """

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}]
        )

        reply = response.choices[0].message.content
        ranked_order = _parse_ranked_indexes(reply, len(retrieved_items))

        return [retrieved_items[i] for i in ranked_order[:top_k]]

    except Exception as e:
        # Deliberate best-effort: re-ranking is optional, retrieval order is
        # an acceptable answer.
        print("LLM re-ranking failed, using raw results:", e)
        return retrieved_items[:top_k]



# Format results 

def _clean_duration(raw_value):
    """Coerce a raw duration to a whole number, or None when it is unusable.

    None, empty strings, NaN (float or the text "nan"), infinities and
    non-numeric values all map to None.
    """
    try:
        return int(float(raw_value))
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / non-numeric objects; ValueError: "", text, NaN;
        # OverflowError: +/- infinity. Anything else (e.g. KeyboardInterrupt)
        # is not a data problem and is allowed to propagate.
        return None


def format_output(items):
    """Convert catalogue row dicts into the response payload.

    Args:
        items: Iterable of dicts read with the keys "Link", "Assessment Name",
            "Description", "Duration", "Adaptive/IRT (Yes/No)",
            "Remote Support (Yes/No)" and "Test Type". Missing keys fall back
            to "" / None / "Unknown".

    Returns:
        ``{"recommended_assessments": [...]}`` with one dict per input item.
        ``duration`` is an int or None; ``test_type`` is a one-element list.
    """
    formatted = [
        {
            "url": item.get("Link", ""),
            "name": item.get("Assessment Name", ""),
            "description": item.get("Description", ""),
            "duration": _clean_duration(item.get("Duration", None)),
            "adaptive_support": item.get("Adaptive/IRT (Yes/No)", "Unknown"),
            "remote_support": item.get("Remote Support (Yes/No)", "Unknown"),
            "test_type": [item.get("Test Type", "Unknown")],
        }
        for item in items
    ]

    return {"recommended_assessments": formatted}


# MAIN PIPELINE FUNCTION TO CALL FROM API

def recommend(query, use_llm=True, top_k=10, candidate_k=20):
    """Run the full recommendation pipeline for one query.

    Args:
        query: Free-text description of the hiring need.
        use_llm: When True, the retrieved candidates are re-ordered with
            ``rerank_with_llm``; when False, retrieval order is kept.
        top_k: Number of recommendations to return (default 10).
        candidate_k: Number of candidates fetched by ``semantic_search``
            before re-ranking (default 20). Raised to ``top_k`` if smaller so
            the final list can always be filled.

    Returns:
        The dict produced by ``format_output`` for the selected items.
    """
    retrieved = semantic_search(query, top_k=max(candidate_k, top_k))

    if use_llm:
        final_items = rerank_with_llm(query, retrieved, top_k=top_k)
    else:
        final_items = retrieved[:top_k]

    return format_output(final_items)




  import pynvml  # type: ignore[import]


Loading FAISS index...
Loading metadata...
Loaded model: BAAI/bge-small-en-v1.5


In [3]:

# TEST THE SYSTEM
# Smoke test: run the whole pipeline on a sample query and print the payload.
q = "I am hiring a Python developer who works well with teams"
result = recommend(q, use_llm=True)  # LLM re-ranking is ON here; pass use_llm=False to skip it while testing
print(result)


LLM re-ranking failed, using raw results: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
{'recommended_assessments': [{'url': 'https://www.shl.com/products/product-catalog/view/python-new/', 'name': 'Python (New)', 'description': 'Multi-choice test that measures the knowledge of Python programming, databases, modules and library.', 'duration': 11, 'adaptive_support': 'No', 'remote_support': 'Yes', 'test_type': ['K']}, {'url': 'https://www.shl.com/products/product-catalog/view/job-control-language-new/', 'name': 'Job Control Language (New)', 'description': 'Multi-choice test that measures the knowledge of JCL libraries, parameters, statements, datasets, generation of data groups and conditional processing.', 'duration': 10, 'adaptive_support': 'No', 'remote_support': 'Yes', 'test_type': ['K']}, {'url': 'https://www.shl.com/products/product-catalog/view/verify-interactive-g-candidate-report/', 'name': 'V