In [28]:
import pandas as pd

In [2]:
df = pd.read_csv("test_data.xls")
print(df.columns)

Index(['Test_id', 'Test_Name', 'Test_Link', 'Remote_Testing',
       'Adaptive_Testing', 'Test_Types', 'Description', 'Job_Levels',
       'Languages', 'Assessment_Length'],
      dtype='object')


In [3]:
print(df.head())

   Test_id                         Test_Name  \
0     4302  Global Skills Development Report   
1     3827                .NET Framework 4.5   
2     4094                    .NET MVC (New)   
3     4099                   .NET MVVM (New)   
4     4018                    .NET WCF (New)   

                                           Test_Link Remote_Testing  \
0  /solutions/products/product-catalog/view/globa...            YES   
1  /solutions/products/product-catalog/view/net-f...            YES   
2  /solutions/products/product-catalog/view/net-m...            YES   
3  /solutions/products/product-catalog/view/net-m...            YES   
4  /solutions/products/product-catalog/view/net-w...            YES   

  Adaptive_Testing                                         Test_Types  \
0               NO  ['Ability & Aptitude', 'BioData & Situational ...   
1              YES                             ['Knowledge & Skills']   
2               NO                             ['Knowledge & Skil

In [4]:
print(df["Test_Link"].iloc[0])

/solutions/products/product-catalog/view/global-skills-development-report/


# Data preprocessing

### Adding full url to the test link

In [5]:
df["Test_Link"] = df["Test_Link"].apply(lambda x: "https://www.shl.com" + x if not x.startswith("https://www.shl.com") else x)
print(df["Test_Link"].iloc[0])

https://www.shl.com/solutions/products/product-catalog/view/global-skills-development-report/


### Handling None

In [6]:
df['Languages'] = df['Languages'].replace("NONE", "No info")

In [7]:
df.fillna("No info", inplace=True)
df.replace("NONE", "No info", inplace=True)

### Converting list like strings to text

In [8]:
df['Test_Types'] = df['Test_Types'].apply(lambda x: ', '.join(eval(x)) if isinstance(x, str) and x.startswith("[") else x)

In [9]:
import re

def extract_duration(length_str):
    if isinstance(length_str, str):
        match = re.search(r'\d+', length_str)
        return int(match.group()) if match else None
    return None


In [10]:

df['Assessment_Minutes'] = df['Assessment_Length'].apply(extract_duration)

# Replace NaN values with "No info"
df['Assessment_Minutes'] = df['Assessment_Minutes'].fillna("No info")

# Output the resulting DataFrame
print(df['Assessment_Minutes'])

0      No info
1         30.0
2         17.0
3          5.0
4         11.0
        ...   
367       12.0
368        9.0
369       15.0
370       15.0
371       15.0
Name: Assessment_Minutes, Length: 372, dtype: object


### making column if short_test : if greater than 30 will give long-test ,if some one mentions short test then will give more priority to them

In [11]:
df["short_test"] = df["Assessment_Minutes"].apply(
    lambda x: "short test" if (isinstance(x, int) and x <= 30) or x == "No info" else "long test"
)

In [12]:
df.head()

Unnamed: 0,Test_id,Test_Name,Test_Link,Remote_Testing,Adaptive_Testing,Test_Types,Description,Job_Levels,Languages,Assessment_Length,Assessment_Minutes,short_test
0,4302,Global Skills Development Report,https://www.shl.com/solutions/products/product...,YES,NO,"Ability & Aptitude, BioData & Situational Judg...",This report is designed to be given to individ...,"Director, Entry-Level, Executive, General Popu...",No info,No info,No info,short test
1,3827,.NET Framework 4.5,https://www.shl.com/solutions/products/product...,YES,YES,Knowledge & Skills,The.NET Framework 4.5 test measures knowledge ...,"Professional Individual Contributor, Mid-Profe...","English (USA),",Approximate Completion Time in minutes = 30,30.0,long test
2,4094,.NET MVC (New),https://www.shl.com/solutions/products/product...,YES,NO,Knowledge & Skills,Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 17,17.0,long test
3,4099,.NET MVVM (New),https://www.shl.com/solutions/products/product...,YES,NO,Knowledge & Skills,Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 5,5.0,long test
4,4018,.NET WCF (New),https://www.shl.com/solutions/products/product...,YES,NO,Knowledge & Skills,Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 11,11.0,long test


## Now lets make the combined column to feed in sentence trasnformer

In [13]:
def combine_columns(row):
    parts = []

    # Always include name and description
    parts.append(f"{row['Test_Name']} - {row['Description']}")

    # Conditionally include test type
    if row['Test_Types'] != "No info":
        parts.append(f"Test Type: {row['Test_Types']}")

    # Conditionally include job levels
    if row['Job_Levels'] != "No info":
        parts.append(f"Target Job Levels: {row['Job_Levels']}")

    # Always include assessment length
    parts.append(f"Duration: {row['Assessment_Length']}")

    # Handle the case for Assessment_Minutes
    if row['Assessment_Minutes'] != "No info":
        parts.append(f"Duration in Minutes: {row['Assessment_Minutes']}")
    else:
        parts.append("Duration in Minutes: No info")

    # Conditionally include "Remote Testing" only if Yes
    if row['Remote_Testing'].strip().lower() == "yes":
        parts.append("Supports Remote Testing")
        
    ##Add short/long/unknown classification
    parts.append(row['short_test'])
    
    # Conditionally include "Adaptive Testing" only if Yes
    if row['Adaptive_Testing'].strip().lower() == "yes":
        parts.append("Supports Adaptive Testing")

    # Conditionally include language info
    if row['Languages'] != "No info":
        parts.append(f"Languages: {row['Languages']}")

    # Join all parts into one string
    return " | ".join(parts)


In [14]:
df['combined_text'] = df.apply(combine_columns, axis=1)

In [15]:
df.head()

Unnamed: 0,Test_id,Test_Name,Test_Link,Remote_Testing,Adaptive_Testing,Test_Types,Description,Job_Levels,Languages,Assessment_Length,Assessment_Minutes,short_test,combined_text
0,4302,Global Skills Development Report,https://www.shl.com/solutions/products/product...,YES,NO,"Ability & Aptitude, BioData & Situational Judg...",This report is designed to be given to individ...,"Director, Entry-Level, Executive, General Popu...",No info,No info,No info,short test,Global Skills Development Report - This report...
1,3827,.NET Framework 4.5,https://www.shl.com/solutions/products/product...,YES,YES,Knowledge & Skills,The.NET Framework 4.5 test measures knowledge ...,"Professional Individual Contributor, Mid-Profe...","English (USA),",Approximate Completion Time in minutes = 30,30.0,long test,.NET Framework 4.5 - The.NET Framework 4.5 tes...
2,4094,.NET MVC (New),https://www.shl.com/solutions/products/product...,YES,NO,Knowledge & Skills,Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 17,17.0,long test,.NET MVC (New) - Multi-choice test that measur...
3,4099,.NET MVVM (New),https://www.shl.com/solutions/products/product...,YES,NO,Knowledge & Skills,Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 5,5.0,long test,.NET MVVM (New) - Multi-choice test that measu...
4,4018,.NET WCF (New),https://www.shl.com/solutions/products/product...,YES,NO,Knowledge & Skills,Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 11,11.0,long test,.NET WCF (New) - Multi-choice test that measur...


### Generating embeddings using sentence transformer model - all-mpnet-base-v2

In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-mpnet-base-v2")


In [17]:
df["combined_text"] 

0      Global Skills Development Report - This report...
1      .NET Framework 4.5 - The.NET Framework 4.5 tes...
2      .NET MVC (New) - Multi-choice test that measur...
3      .NET MVVM (New) - Multi-choice test that measu...
4      .NET WCF (New) - Multi-choice test that measur...
                             ...                        
367    Workplace Administration Skills (New) - Multi-...
368    Workplace Health and Safety (New) - Multi-choi...
369    WriteX - Email Writing (Customer Service) (New...
370    WriteX - Email Writing (Managerial) (New) - Op...
371    WriteX - Email Writing (Sales) (New) - Open re...
Name: combined_text, Length: 372, dtype: object

In [18]:
# Encode combined_text column
texts = df["combined_text"].tolist()
embeddings = model.encode(texts, show_progress_bar=True)


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

NameError: name 'np' is not defined

In [20]:
import numpy as np
# Convert to numpy array (FAISS requires float32)
embedding_matrix = np.array(embeddings).astype("float32")

print(embedding_matrix.shape)  # Should be (number of rows, 384)

(372, 768)


In [21]:
#Normalize for Cosine Similarity 
def normalize(vectors):
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / norms

embedding_matrix = normalize(embedding_matrix)

### Storing in faiss

In [22]:
import faiss
# Your embedding_matrix should be a NumPy array of shape (n, 384)
dimension = embedding_matrix.shape[1]  # 384 for MiniLM

# # We can Create index using L2 (Euclidean) distance or cosine simlarity but doing here with cosine as its better 
# index = faiss.IndexFlatL2(dimension)
index = faiss.IndexFlatIP(dimension)

# Add embeddings to the index
index.add(embedding_matrix)

print("Index contains", index.ntotal, "vectors")


Index contains 372 vectors


In [23]:
faiss.write_index(index, "shl_index.faiss")
df.to_csv("shl_final_catalog.csv", index=False)
np.save("shl_embeddings.npy", embedding_matrix)

In [27]:
# # This is useful so you don’t have to recompute embeddings every time.
# #later, reload with:

# index = faiss.read_index("shl_index.faiss")
# df = pd.read_csv("shl_final_catalog.csv")
# embedding_matrix = np.load("shl_embeddings.npy")

In [26]:
query = "Looking for a short test on leadership and communication for entry-level roles"
query_vector = model.encode([query]).astype("float32")

top_k = 10
distances, indices = index.search(query_vector, top_k)

# Show top 10 recommended tests
for idx in indices[0]:
    print(df.iloc[idx][["Test_Name", "Assessment_Length", "Test_Types", "Remote_Testing", "Test_Link","Description"]])
    print("-" * 60)


Test_Name                                 Interpersonal Communications
Assessment_Length    Approximate Completion Time in minutes = 15 to 35
Test_Types                                          Knowledge & Skills
Remote_Testing                                                     YES
Test_Link            https://www.shl.com/solutions/products/product...
Description          This adaptive test measures the candidate's kn...
Name: 145, dtype: object
------------------------------------------------------------
Test_Name                                      Business Communications
Assessment_Length          Approximate Completion Time in minutes = 35
Test_Types                                          Knowledge & Skills
Remote_Testing                                                     YES
Test_Link            https://www.shl.com/solutions/products/product...
Description          This test measures the candidate's knowledge o...
Name: 50, dtype: object
--------------------------------------

In [36]:
#testing the recommend endpoint

import requests

url = "https://shl-fastapi-uf7r.onrender.com/health-checkup"
data = {"query": "Looking for an assessment for a leadership role with remote testing"}

response = requests.post(url, json=data)

# Print the response
print(response.status_code)  # Check the status code
print(response.text)  # Print the raw response text



405
{"detail":"Method Not Allowed"}
