In [1]:
!pip install pandas



In [2]:
!pip install sentence_transformers



In [3]:
!pip install scikit-learn



In [4]:
!pip install tqdm



In [6]:
import pandas as pd

# Location of the cleaned dataset CSV (absolute path on the author's machine)
dataset_path = r'D:/Semantic_Search_Engine/Data/cleaned_dataset.csv'

# Full table; later cells look rows up by position, so keep its order intact
df = pd.read_csv(dataset_path)

# The 'title' column as an array: titles[k] comes from row k of df
titles = df['title'].values

In [7]:
# Preview the first 15 product titles to sanity-check the loaded column
titles[:15]

array(['OnePlus Nord CE 3 5G | 12GB RAM and 256GB ROM | 6.7 inch Fluid AMOLED 120 Hz Display | 80W SuperVOOC Charge',
       'OnePlus Nord CE3 5G |12GB RAM, 256 GB Storage | Snapdragon™ 782G | 80W SUPERVOOC Charge | 17.02cm 120 Hz Fluid AMOLED + Dual Speakers',
       'OnePlus Nord N30SE 5G | 4GB RAM, 128GB Storage | 5000 mAh Battery',
       'Benco S1 (8GB+128GB) || Fingerprint || 48MP + 2MP + AI Camera || 5000Mah Battery',
       'OnePlus Nord N30SE 5G (4/128GB) | 6.72" FHD+ Sunlight Display | 5000 mAh Battery',
       'Apple iPhone 15 Pro Max - EvoStore',
       'Tecno Spark 20 Pro+ (16*/256 GB) | 6.78" FHD + AMOLED Curved Screen | 120Hz Refresh Rate | 100 Days Replacement Warranty | G99 Ultra Boost Processor | 108MP Ultra Sensing Main Camera | 5000mAh Battery | 33W Super Charge',
       'Oneplus Nord N30 SE 5G || 4/128 GB || 5000 mAh Battery',
       'Oneplus Nord CE 3 5G || 12/256GB || Snapdragon 782G Chipset',
       'Xiaomi Redmi 13C 4G LTE (4/128GB) | 6.74in Display | MediaTek 

In [8]:
# Number of titles loaded (one per row of df)
len(titles)

360

In [9]:
# NOTE: superseded by the batched encoding in the next cell; kept for reference only.
# from sentence_transformers import SentenceTransformer

# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# title_embeddings = model.encode(titles)

In [10]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
import numpy as np


# Load the pre-trained sentence-embedding model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Number of titles passed to model.encode per call
batch_size = 32

# Row k of the final array must be the embedding of titles[k]: search() maps
# similarity indices straight back onto DataFrame rows with df.iloc, so a
# dropped batch would silently shift every later result onto the wrong product.
title_embeddings = []

# Start offsets of the batches that could not be encoded
failed_batch_starts = []

# Use tqdm to show one progress tick per batch
for i in tqdm(range(0, len(titles), batch_size), desc="Encoding titles", unit="batch"):
    # Current slice of titles (the last batch may be shorter than batch_size)
    batch_titles = titles[i:i + batch_size]

    try:
        embeddings = model.encode(batch_titles, show_progress_bar=False)
    except Exception as e:
        print(f"Error encoding titles {i} to {i + batch_size}: {e}")
        failed_batch_starts.append(i)
        # Stay best-effort, but hold the failed rows' positions with zero
        # vectors so every other embedding keeps its original index. A zero
        # vector has no direction, so these rows should rank at similarity 0.
        embeddings = np.zeros(
            (len(batch_titles), model.get_sentence_embedding_dimension()),
            dtype=np.float32,
        )

    title_embeddings.extend(embeddings)

# Convert the list of per-title vectors to a single 2-D numpy array
title_embeddings = np.array(title_embeddings)

if failed_batch_starts:
    print(
        f"Warning: {len(failed_batch_starts)} batch(es) failed and were zero-filled "
        f"(start offsets: {failed_batch_starts})"
    )

# Fail loudly if the embeddings ever stop lining up with the titles
assert len(title_embeddings) == len(titles), (
    f"Expected {len(titles)} embeddings, got {len(title_embeddings)}"
)

# Output the shape of the embeddings to verify
print(f"Shape of title embeddings: {title_embeddings.shape}")


  from tqdm.autonotebook import tqdm, trange
Encoding titles: 100%|██████████| 12/12 [00:22<00:00,  1.84s/batch]

Shape of title embeddings: (360, 384)





In [11]:
# Persist the embedding matrix to 'title_embeddings.npy' (relative to the
# notebook's working directory) so it can be reloaded with np.load instead of
# re-encoding every title.
np.save('title_embeddings.npy', title_embeddings)


In [12]:
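# NOTE: earlier draft of search() without the `model` argument or the relevance
# threshold; superseded by the definition in the next cell and kept for reference only.
# from sklearn.metrics.pairwise import cosine_similarity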
# import numpy as np

# def search(query, title_embeddings, titles, df, top_n=5):
#     """
#     Function to search for the most similar product titles.
    
#     Args:
#     - query (str): Search query.
#     - title_embeddings (ndarray): Precomputed embeddings of product titles.
#     - titles (list): List of product titles.
#     - df (DataFrame): Original DataFrame containing all product information.
#     - top_n (int): Number of top results to return.
    
#     Returns:
#     - List of tuples with product details and similarity scores.
#     """
#     # Convert the search query into an embedding
#     query_embedding = model.encode([query])

#     # Compute cosine similarities between query and all product title embeddings
#     similarities = cosine_similarity(query_embedding, title_embeddings)[0]

#     # Get indices of top N most similar products
#     top_indices = np.argsort(similarities)[::-1][:top_n]

#     # Fetch product details from the original DataFrame for the top results
#     results = []
#     for idx in top_indices:
#         product_info = {
#             'title': df.iloc[idx]['title'],
#             'price': df.iloc[idx]['price'],
#             'rating': df.iloc[idx]['rating'],
#             'sold': df.iloc[idx]['sold'],
#             'offer': df.iloc[idx]['offer'],
#             'link': df.iloc[idx]['full_link'],
#             'similarity': similarities[idx]
#         }
#         results.append(product_info)

#     return results


In [13]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(query, title_embeddings, titles, df, model, top_n=5, min_similarity=0.45):
    """
    Rank products by cosine similarity between the query and their titles.

    Args:
    - query (str): Search query.
    - title_embeddings (ndarray): Precomputed embeddings of product titles.
      Row i must be the embedding of row i of `df` (positional alignment).
    - titles (list): List of product titles. Not used by this function; kept
      so existing callers do not have to change.
    - df (DataFrame): Original DataFrame containing all product information.
      The columns 'title', 'price', 'rating', 'sold', 'offer' and 'full_link'
      are read.
    - model: Object whose `encode([query])` returns the query embedding.
    - top_n (int): Number of top results to return.
    - min_similarity (float): If the best match scores below this value, the
      query is treated as having no relevant results. Defaults to 0.45.

    Returns:
    - A list of dicts (best match first) with keys 'title', 'price', 'rating',
      'sold', 'offer', 'link' and 'similarity'; or the dict
      {'error': 'No relevant results found for the given query.'} when there is
      nothing to rank, `top_n` is not positive, or the best score is below
      `min_similarity`.

    Raises:
    - ValueError: If `title_embeddings` and `df` have different lengths, since
      indices could then no longer be mapped back to the right product.
    """
    no_results = {'error': 'No relevant results found for the given query.'}

    # Similarity indices are used directly as df.iloc positions, so the two
    # inputs must line up row for row.
    if len(title_embeddings) != len(df):
        raise ValueError(
            f"title_embeddings has {len(title_embeddings)} rows but df has "
            f"{len(df)}; they must be aligned row-for-row."
        )

    # Nothing to rank: avoid indexing into an empty top_indices below.
    if top_n <= 0 or len(title_embeddings) == 0:
        return no_results

    # Convert the search query into an embedding
    query_embedding = model.encode([query])

    # Compute cosine similarities between query and all product title embeddings
    similarities = cosine_similarity(query_embedding, title_embeddings)[0]

    # Get indices of top N most similar products (descending similarity)
    top_indices = np.argsort(similarities)[::-1][:top_n]

    # Reject the whole result set when even the best match is too weak
    if similarities[top_indices[0]] < min_similarity:
        return no_results

    # Fetch product details from the original DataFrame for the top results
    results = []
    for idx in top_indices:
        row = df.iloc[idx]  # single positional lookup per result
        results.append({
            'title': row['title'],
            'price': row['price'],
            'rating': row['rating'],
            'sold': row['sold'],
            'offer': row['offer'],
            'link': row['full_link'],
            'similarity': similarities[idx]
        })

    return results


In [14]:
# Sample search query
query = "OnePlus 5G smartphone"

# Top matches for the query (search() defaults to top_n=5); `model` is the
# SentenceTransformer loaded in the encoding cell
search_results = search(query, title_embeddings, titles, df, model)

# search() returns a dict with an 'error' key when nothing is relevant enough,
# otherwise a list of result dicts. Check the type explicitly: testing
# `'error' in <list of dicts>` only gives the right answer by accident.
if isinstance(search_results, dict) and 'error' in search_results:
    print(search_results['error'])
else:
    # Print the search results, best match first
    for result in search_results:
        print(f"Title: {result['title']}")
        print(f"Price: {result['price']}")
        print(f"Rating: {result['rating']}")
        print(f"Sold Units: {result['sold']}")
        print(f"Offer: {result['offer']}")
        print(f"Product Link: {result['link']}")
        print(f"Similarity Score: {result['similarity']:.4f}")
        print("-" * 80)


Title: OnePlus Nord 2T 5G Mobile phone ( 6.43 inch fluid Amoled Display 80W SuperVOOC Charging )
Price: 64999.0
Rating: 2.0
Sold Units: 12.0
Offer: nan
Product Link: https://www.daraz.com.np/products/oneplus-nord-2t-5g-mobile-phone-643-inch-fluid-amoled-display-80w-supervooc-charging-i114806947.html
Similarity Score: 0.6551
--------------------------------------------------------------------------------
Title: Q413 SMART PHONE
Price: 5600.0
Rating: 3.0
Sold Units: 15.0
Offer: 42% Off
Product Link: https://www.daraz.com.np/products/q413-smart-phone-i129005432.html
Similarity Score: 0.5689
--------------------------------------------------------------------------------
Title: Q413 SMART PHONE
Price: 5600.0
Rating: 3.0
Sold Units: 15.0
Offer: 42% Off
Product Link: https://www.daraz.com.np/products/q413-smart-phone-i129005432.html
Similarity Score: 0.5689
--------------------------------------------------------------------------------
Title: ITEL A60s 4GB+4GB RAM 128GB Storage Mobile Smart