In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [None]:
# Load the product catalogue; malformed CSV rows are skipped rather than aborting the read.
df = pd.read_csv("test.csv", on_bad_lines="skip")

# Extract product display names.
# A missing productDisplayName is read by pandas as float NaN, which neither the
# tokenizer nor the sentence encoder can process, so drop those entries and make
# sure every remaining name is a plain string.
# All later ranking indexes into `product_display_names` (not `df`), so dropping
# entries here keeps indices and names consistent.
product_display_names = df['productDisplayName'].dropna().astype(str).tolist()

In [None]:
# Tokenizer and encoder weights are both pulled from the same pre-trained BERT checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
# Sentence-embedding alternative to the raw BERT encoder.
# NOTE(review): this rebinds `model`; once this cell has run, the BertModel loaded
# earlier is no longer reachable under that name.
SENTENCE_MODEL_NAME = 'paraphrase-distilroberta-base-v1'
model = SentenceTransformer(SENTENCE_MODEL_NAME)

In [None]:
# Define a function to tokenize and encode text
def encode_text(text):
    """Turn a piece of text into a list of token ids using the global `tokenizer`.

    The text is split into tokens, cut down so that the sequence still fits in
    `tokenizer.model_max_length` once the two special tokens are added, and
    then converted to ids with the special tokens included.

    Args:
        text: The string to encode.

    Returns:
        The list of token ids produced by `tokenizer.encode`.
    """
    wordpieces = tokenizer.tokenize(text)
    # Reserve two positions for the special tokens added by `encode`.
    budget = tokenizer.model_max_length - 2
    return tokenizer.encode(wordpieces[:budget], add_special_tokens=True)

In [None]:
# Token-id encoding of every product display name (one id list per product).
encoded_product_display_names = list(map(encode_text, product_display_names))

In [None]:
# Embed every product display name in one call to `model.encode`.
# NOTE(review): assumes `model` is the SentenceTransformer instance at this point
# (a BertModel has no `encode` method) — confirm cell execution order.
# NOTE(review): this rebinds `encoded_product_display_names`, replacing the
# token-id lists built with `encode_text` in the previous cell.
encoded_product_display_names = model.encode(product_display_names)

In [None]:
# Encode user query
# `encode_text` returns a list of token ids (special tokens included), not an
# embedding vector; the ids are meant to be fed to a model afterwards.
user_query = "red shoes"
encoded_user_query = encode_text(user_query)

In [None]:
# Calculate similarity between the query and every product using [CLS] embeddings.
# NOTE(review): assumes `model` is callable on a tensor of token ids (the BertModel)
# and that `encoded_product_display_names` holds token-id lists from `encode_text`
# — confirm cell execution order.
def _cls_embedding(token_ids):
    """Run one token-id sequence through `model` and return its first-position ([CLS]) vector.

    Args:
        token_ids: Token ids for a single text, as returned by `encode_text`.

    Returns:
        A NumPy array with a leading batch dimension of 1, suitable as one
        argument to `cosine_similarity`.
    """
    with torch.no_grad():  # inference only; also lets .numpy() be called directly
        batch = torch.tensor(token_ids).unsqueeze(0)  # add the batch dimension
        return model(batch)[0][:, 0, :].numpy()


user_query_embedding = _cls_embedding(encoded_user_query)

# One score per product, in the same order as `product_display_names`.
# Scores are collected silently: printing each one floods the cell output on
# any realistically sized catalogue.
similarities = [
    cosine_similarity(user_query_embedding, _cls_embedding(token_ids))[0][0]
    for token_ids in encoded_product_display_names
]

In [None]:
# Calculate similarity in sentence-embedding space.
# `encoded_user_query` holds token ids from `encode_text`, not an embedding, so it
# cannot be compared against the product embeddings (the vector lengths do not
# match). Embed the raw query with the same model that embedded the products.
# NOTE(review): assumes `model` is the SentenceTransformer and
# `encoded_product_display_names` is its embedding output — confirm cell order.
query_embedding = model.encode([user_query])  # single-item batch: one row
similarities = cosine_similarity(query_embedding, encoded_product_display_names)[0]

# Rank results: indices of the k highest scores, best first.
# Reversing before slicing keeps k == 0 meaning "no results" (a `[-0:]` slice
# would return every product).
k = 5
top_k_indices = similarities.argsort()[::-1][:k]
top_k_products = [product_display_names[index] for index in top_k_indices]

In [None]:
# Rank results: order product positions by descending similarity and keep the first k.
# The sort is stable, so products with equal scores stay in their original order.
k = 5
ranked_positions = sorted(
    range(len(similarities)),
    key=lambda position: similarities[position],
    reverse=True,
)
top_k_indices = ranked_positions[:k]
top_k_products = [product_display_names[position] for position in top_k_indices]

In [None]:
# Show the best matches as a 1-based numbered list.
for rank, matched_name in enumerate(top_k_products, start=1):
    print(f"{rank}. {matched_name}")