!pip install selenium pandas webdriver-manager


Scraping the Data From the SHL website using Selenium 

In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Open catalog
driver.get("https://www.shl.com/solutions/products/product-catalog/")
time.sleep(5)

# Grab all <tr> rows in the main catalog table
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")[1:]  # Skipping header

catalog_links = []

for row in rows:
    try:
        cells = row.find_elements(By.TAG_NAME, "td")

        link_elem = row.find_element(By.TAG_NAME, "a")
        name = link_elem.text.strip()
        url = link_elem.get_attribute("href")

        # Check green dot by class ".catalogue__circle.-yes"
        try:
            cells[1].find_element(By.CSS_SELECTOR, ".catalogue__circle.-yes")
            remote_support = "Yes"
        except:
            remote_support = "No"

        try:
            cells[2].find_element(By.CSS_SELECTOR, ".catalogue__circle.-yes")
            adaptive_support = "Yes"
        except:
            adaptive_support = "No"

        # Get test type
        test_type = ", ".join([tt.text.strip() for tt in cells[3].find_elements(By.TAG_NAME, "span")])

        catalog_links.append({
            "name": name,
            "url": url,
            "remote_support": remote_support,
            "adaptive_support": adaptive_support,
            "test_type": test_type
        })

    except Exception as e:
        print("⚠️ Error processing row:", e)
        continue



print(f"✅ Found {len(catalog_links)} valid assessment entries.")

# Save to CSV
df = pd.DataFrame(catalog_links)
df.to_csv("shl_assessments.csv", index=False)
print("✅ Saved to shl_assessments.csv")

driver.quit()


⚠️ Error processing row: Message: no such element: Unable to locate element: {"method":"tag name","selector":"a"}
  (Session info: chrome=135.0.7049.96); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000104ffee10 cxxbridge1$str$ptr + 2817040
1   chromedriver                        0x0000000104ff70ac cxxbridge1$str$ptr + 2784940
2   chromedriver                        0x0000000104b3e8d8 cxxbridge1$string$len + 93028
3   chromedriver                        0x0000000104b856a0 cxxbridge1$string$len + 383276
4   chromedriver                        0x0000000104b7ad3c cxxbridge1$string$len + 339912
5   chromedriver                        0x0000000104bc67b8 cxxbridge1$string$len + 649796
6   chromedriver                        0x0000000104b79a80 cxxbridge1$string$len + 335116
7   chromedriver                        0x0000000104fc3c98 cxxbr

Now we will Do Embedding

In [7]:

import subprocess
subprocess.run(["pip", "install", "google-generativeai"])

Note: you may need to restart the kernel to use updated packages.


In [8]:
import google.generativeai as genai
genai.configure(api_key="your api key ")
#api key

In [10]:
def get_embedding(text):
    response = genai.embed_content(
        model="models/embedding-001",
        content=text,
        task_type="RETRIEVAL_DOCUMENT"
    )
    return response['embedding']



In [17]:
import pandas as pd

df = pd.read_csv("shl_assessments.csv")  # or your actual CSV name
df.head()


Unnamed: 0,name,url,remote_support,adaptive_support,test_type
0,Account Manager Solution,https://www.shl.com/solutions/products/product...,Yes,Yes,"C, P, A, B"
1,Administrative Professional - Short Form,https://www.shl.com/solutions/products/product...,Yes,Yes,"A, K, P"
2,Agency Manager Solution,https://www.shl.com/solutions/products/product...,Yes,Yes,"A, B, P, S"
3,Apprentice + 8.0 Job Focused Assessment,https://www.shl.com/solutions/products/product...,Yes,No,"B, P"
4,Apprentice 8.0 Job Focused Assessment,https://www.shl.com/solutions/products/product...,Yes,No,"B, P"


In [18]:
df["full_text"] = (
    df["name"].fillna("") +
    ". Remote Support: " + df["remote_support"].fillna("") +
    ". Adaptive Support: " + df["adaptive_support"].fillna("") +
    ". Type: " + df["test_type"].fillna("")
)


In [21]:
from tqdm import tqdm
tqdm.pandas()

df["embedding"] = df["full_text"].progress_apply(get_embedding)


100%|███████████████████████████████████████████| 24/24 [00:12<00:00,  1.97it/s]


In [22]:
import pickle

with open("shl_assessments_with_embeddings.pkl", "wb") as f:
    pickle.dump(df, f)



✅ Embeddings created and saved.


In [28]:
import pandas as pd

# Specify the path to your pickle file
pickle_file_path = "shl_assessments_with_embeddings.pkl"

# Load the pickle file into a DataFrame
df = pd.read_pickle(pickle_file_path)

# Display the DataFrame
print(df.head())


                                       name  \
0                  Account Manager Solution   
1  Administrative Professional - Short Form   
2                   Agency Manager Solution   
3   Apprentice + 8.0 Job Focused Assessment   
4     Apprentice 8.0 Job Focused Assessment   

                                                 url remote_support  \
0  https://www.shl.com/solutions/products/product...            Yes   
1  https://www.shl.com/solutions/products/product...            Yes   
2  https://www.shl.com/solutions/products/product...            Yes   
3  https://www.shl.com/solutions/products/product...            Yes   
4  https://www.shl.com/solutions/products/product...            Yes   

  adaptive_support   test_type  \
0              Yes  C, P, A, B   
1              Yes     A, K, P   
2              Yes  A, B, P, S   
3               No        B, P   
4               No        B, P   

                                           full_text  \
0  Account Manager Solution. 

We Start BUIlding Recommendation system

In [40]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# Load the DataFrame from .pkl file
with open("shl_assessments_with_embeddings.pkl", "rb") as f:
    df = pickle.load(f)

# Convert the 'embedding' column into a matrix
embeddings = np.vstack(df["embedding"].values)  # shape: (n, 768)

# Gemini embedding function (yours should already be working)
def get_embedding(text: str):
    response = genai.embed_content(
        model="models/embedding-001",
        content=text,
        task_type="RETRIEVAL_DOCUMENT"
    )
    return response['embedding']

# Final recommendation function
def recommend_assessments(query: str, top_k: int = 10):
    query_vector = np.array(get_embedding(query)).reshape(1, -1)
    similarities = cosine_similarity(query_vector, embeddings)[0]
    top_indices = similarities.argsort()[-top_k:][::-1]

    results = []
    for idx in top_indices:
        row = df.iloc[idx]
        results.append({
            "assessment_name": row["name"],
            "url": row["url"],
            "remote_testing": row["remote_support"],      # corrected key
            "adaptive_support": row["adaptive_support"],
            "duration": "Not provided",                    # optional: add if you later extract it
            "test_type": row["test_type"],
            "similarity_score": float(similarities[idx])   # optional: useful for debugging
        })

    return results

# Example usage
if __name__ == "__main__":
    query = "Looking for a sales manager with good communication and leadership"
    recommendations = recommend_assessments(query)
    for rec in recommendations:
        print(rec)


{'assessment_name': 'Account Manager Solution', 'url': 'https://www.shl.com/solutions/products/product-catalog/view/account-manager-solution/', 'remote_testing': 'Yes', 'adaptive_support': 'Yes', 'duration': 'Not provided', 'test_type': 'C, P, A, B', 'similarity_score': 0.8335354235866881}
{'assessment_name': 'Accounts Receivable (New)', 'url': 'https://www.shl.com/solutions/products/product-catalog/view/accounts-receivable-new/', 'remote_testing': 'Yes', 'adaptive_support': 'No', 'duration': 'Not provided', 'test_type': 'K', 'similarity_score': 0.8130644080931066}
{'assessment_name': 'Agency Manager Solution', 'url': 'https://www.shl.com/solutions/products/product-catalog/view/agency-manager-solution/', 'remote_testing': 'Yes', 'adaptive_support': 'Yes', 'duration': 'Not provided', 'test_type': 'A, B, P, S', 'similarity_score': 0.8049733098488957}
{'assessment_name': 'Branch Manager - Short Form', 'url': 'https://www.shl.com/solutions/products/product-catalog/view/branch-manager-short

In [1]:
# Sample test queries with expected relevant keywords
test_queries = [
    {
        "query": "Looking to hire a data analyst with strong cognitive and logical skills",
        "relevant_ids": ["Account", "Administrative", "Professional", "Receivable", "Payable"]
    },
    {
        "query": "Hiring for a customer service executive with good communication and empathy",
        "relevant_ids": ["Administrative", "Account Manager"]
    },
    {
        "query": "Need a coding assessment for Java and Python",
        "relevant_ids": [".NET", "Framework", "MVC", "Developer"]
    },
    {
        "query": "Searching for leadership potential test for team leads",
        "relevant_ids": ["Manager", "Leadership", "Global Skills"]
    },
    {
        "query": "Need an adaptive and remote test for fresh graduates",
        "relevant_ids": ["Apprentice", "Graduate", "Development"]
    }
]


# Function to compute Recall@K
def compute_recall_at_k(results, relevant, k):
    retrieved = results[:k]
    relevant_hits = sum(
        any(rel.lower() in r['assessment_name'].lower() or rel.lower() in r['test_type'].lower()
            for rel in relevant)
        for r in retrieved
    )
    total_relevant = len(relevant)
    return relevant_hits / total_relevant if total_relevant else 0

# Function to compute MAP@K
def compute_map_at_k(results, relevant, k):
    hits = 0
    sum_precisions = 0

    for i in range(min(k, len(results))):
        r = results[i]
        is_relevant = any(
            rel.lower() in r['assessment_name'].lower() or rel.lower() in r['test_type'].lower()
            for rel in relevant
        )
        if is_relevant:
            hits += 1
            sum_precisions += hits / (i + 1)

    return sum_precisions / min(len(relevant), k) if relevant else 0

# Master evaluation function
def evaluate_model(k=5):
    total_recall = 0
    total_map = 0
    N = len(test_queries)

    for i, test in enumerate(test_queries, 1):
        query = test["query"]
        relevant = test["relevant_ids"]
        results = recommend_assessments(query)

        # Debug: Print actual top 5 assessment names returned
        print(f"\n[{i}] Query: {query}")
        for res in results[:k]:
            print("   →", res['assessment_name'])

        recall = compute_recall_at_k(results, relevant, k)
        ap = compute_map_at_k(results, relevant, k)

        print(f"→ Recall@{k}: {recall:.2f}")
        print(f"→ AP@{k}:     {ap:.2f}")

        total_recall += recall
        total_map += ap

    mean_recall = total_recall / N
    mean_ap = total_map / N

    print(f"\n📊 Mean Recall@{k}: {mean_recall:.2f}")
    print(f"📊 MAP@{k}:         {mean_ap:.2f}")

if __name__ == "__main__":
    evaluate_model(k=5)

NameError: name 'recommend_assessments' is not defined