## Similarity Search

In [None]:
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

model = SentenceTransformer('bert-base-nli-mean-tokens')

In [14]:
data = pd.read_excel('/content/Technology Skills.xlsx')

In [17]:
unique_skills = list(set(data['Example'].tolist()))

def get_related_skills(user_query, unique_skills, model, threshold=0.8):
    skill_embeddings = model.encode(unique_skills)
    query_embedding = model.encode(user_query)

    cos_similarities = np.dot(query_embedding, skill_embeddings.T) / (np.linalg.norm(query_embedding) * np.linalg.norm(skill_embeddings, axis=1))
    related_indices = np.where(cos_similarities > threshold)[0]
    related_skills = [unique_skills[idx] for idx in related_indices]

    return related_skills

In [18]:
user_query = "Java"
related_skills_java = get_related_skills(user_query, unique_skills, model)

print(related_skills_java)

['Oracle Java', 'Sun Microsystems Java', 'Oracle JavaServer Pages JSP', 'Xerces2 Java Parser', 'JavaScript', 'JavaScript framework software', 'JavaScript Object Notation JSON', 'Oracle Java Message Service JMS', 'Enterprise JavaBeans', 'YourKit Java Profiler', 'Javamin Composer']


## Statistics - Single function

In [None]:
!pip install sentence-transformers

In [39]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import time

In [40]:
model = SentenceTransformer('bert-base-nli-mean-tokens')
data = pd.read_excel('/content/Technology Skills.xlsx')




In [41]:
def get_related_skills(user_query, data, model, threshold=0.8):
    unique_skills = list(set(data['Example'].tolist()))
    skill_embeddings = model.encode(unique_skills)
    query_embedding = model.encode(user_query)

    start_time_conversion = time.time()
    cos_similarities = np.dot(query_embedding, skill_embeddings.T) / (np.linalg.norm(query_embedding) * np.linalg.norm(skill_embeddings, axis=1))
    end_time_conversion = time.time()

    start_time_search = time.time()
    related_indices = np.where(cos_similarities > threshold)[0]
    related_skills = [unique_skills[idx] for idx in related_indices]
    end_time_search = time.time()

    time_conversion = end_time_conversion - start_time_conversion
    time_search = end_time_search - start_time_search

    return related_skills, time_conversion, time_search

user_query = "Java"
related_skills_java, time_conversion, time_search = get_related_skills(user_query, data, model)

print("Related Skills for query 'Java':", related_skills_java)
print('\n')
print("Time taken for converting the dataset into embeddings:", time_conversion, "seconds")
print('\n')
print("Time taken for searching the query with all entries:", time_search, "seconds")
print('\n')
print("Number of entries in the sample dataset:", num_entries)

Related Skills for query 'Java': ['Enterprise JavaBeans', 'Javamin Composer', 'Oracle JavaServer Pages JSP', 'JavaScript framework software', 'Oracle Java', 'Xerces2 Java Parser', 'YourKit Java Profiler', 'JavaScript Object Notation JSON', 'JavaScript', 'Oracle Java Message Service JMS', 'Sun Microsystems Java']


Time taken for converting the dataset into embeddings: 0.019329071044921875 seconds


Time taken for searching the query with all entries: 6.270408630371094e-05 seconds


Number of entries in the sample dataset: 32470


## Statistics - Two separate functions for conversion and searching


In [42]:
model = SentenceTransformer('all-MiniLM-L6-v2')
data = pd.read_excel('/content/Technology Skills.xlsx')



In [None]:
def convert_dataset_to_embeddings(data, model):
    unique_skills = list(set(data['Example'].tolist()))
    start_time_conversion = time.time()
    skill_embeddings = model.encode(unique_skills)
    end_time_conversion = time.time()
    time_conversion = end_time_conversion - start_time_conversion
    return skill_embeddings, time_conversion


In [45]:
def search_query_in_embeddings(user_query, skill_embeddings, unique_skills, threshold=0.6):
    query_embedding = model.encode(user_query)
    start_time_search = time.time()
    cos_similarities = np.dot(query_embedding, skill_embeddings.T) / (np.linalg.norm(query_embedding) * np.linalg.norm(skill_embeddings, axis=1))
    related_indices = np.where(cos_similarities > threshold)[0]
    related_skills = [unique_skills[idx] for idx in related_indices]
    end_time_search = time.time()
    time_search = end_time_search - start_time_search
    return related_skills, time_search


In [47]:
skill_embeddings, time_conversion = convert_dataset_to_embeddings(data, model)

user_query = "Java"
related_skills_java, time_search = search_query_in_embeddings(user_query, skill_embeddings, list(set(data['Example'].tolist())))

print("Related Skills for query 'Java':", related_skills_java)
print("Time taken for converting the dataset into embeddings:", time_conversion, "seconds")
print("Time taken for searching the query with all entries:", time_search, "seconds")
print("Number of entries in the sample dataset:", num_entries)

Related Skills for query 'Java': ['Enterprise JavaBeans', 'Oracle Java', 'Sun Microsystems Java']
Time taken for converting the dataset into embeddings: 3.9604694843292236 seconds
Time taken for searching the query with all entries: 0.0059392452239990234 seconds
Number of entries in the sample dataset: 32470
