In [1]:
from transformers import AutoModel, AutoTokenizer
import torch

# Hugging Face Hub identifier of the embedding model used throughout this notebook.
model_name = "Linq-AI-Research/Linq-Embed-Mistral"

# Load the tokenizer and the model weights from the same checkpoint.
# torch_dtype=torch.float16 requests half-precision weights and device_map="auto"
# leaves device placement to the library; later cells therefore move their inputs
# to model.device instead of assuming a particular device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
import pandas as pd
# Load the product catalogue. The title, llava_generated_image_caption and
# category_name columns are combined further down into the text that gets embedded.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a
# DataFrame index that was written to the CSV — consider index_col=0. TODO confirm.
df_products = pd.read_csv("final_products.csv")
df_products.head(2)

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,category_id,isBestSeller,boughtInLastMonth,id,category_name,llava_generated_image_caption
0,B08ZDQX51W,Original Replacement Dell 130W Laptop Charger ...,https://m.media-amazon.com/images/I/61sADwl+YW...,https://www.amazon.com/dp/B08ZDQX51W,4.5,0,24.98,65,False,0,65,Laptop Accessories,"A black power bank, which is a portable charge..."
1,B01BPCTXHC,Griffin Elevator Stand for Laptops - Lift Your...,https://m.media-amazon.com/images/I/710N2S69Nv...,https://www.amazon.com/dp/B01BPCTXHC,4.6,0,35.0,65,False,0,65,Laptop Accessories,A laptop computer sitting on a stand or a dock...


In [3]:
# Build one text document per product from its title, LLaVA image caption and category.
# The three columns are iterated in lockstep instead of using DataFrame.apply(axis=1),
# which constructs a Series for every row and fails on an empty frame; the comprehension
# simply yields [] in that case. The generated strings are unchanged, including missing
# values being rendered as the literal text "nan".
result_array = [
    f"Title of Product: {title}\nProduct Image Description: {caption}\nProduct Category: {category}"
    for title, caption, category in zip(
        df_products["title"],
        df_products["llava_generated_image_caption"],
        df_products["category_name"],
    )
]
result_array[:2]

['Title of Product: Original Replacement Dell 130W Laptop Charger USB C Slim AC Power Adapter for Dell Xps 17,Precision 5550 5530 2in1,XPS 15 2in1 9575，DA130PM170 HA130PM170 0K00F5 K00F5 0M0H25 M0H25 T4V18\nProduct Image Description: A black power bank, which is a portable charger used to charge electronic devices.\nProduct Category: Laptop Accessories',
 'Title of Product: Griffin Elevator Stand for Laptops - Lift Your Laptop to a Comfortable Viewing Height, Space Grey\nProduct Image Description: A laptop computer sitting on a stand or a docking station.\nProduct Category: Laptop Accessories']

In [None]:
import torch
import torch.nn.functional as F
import numpy as np

def last_token_pool(last_hidden_states, attention_mask):
    """Pick the hidden state of the last attended token of every sequence.

    Args:
        last_hidden_states: Tensor indexed as [row, position, ...]; in this notebook
            it is the model's ``last_hidden_state``.
        attention_mask: Tensor indexed as [row, position] with 1 on real tokens and
            0 on padding.

    Returns:
        One hidden-state vector per row of the batch.
    """
    num_rows = attention_mask.shape[0]

    # If every row attends to its final position, the batch is left-padded (or not
    # padded at all), so the last position is the last real token for every row.
    if attention_mask[:, -1].sum() == num_rows:
        return last_hidden_states[:, -1]

    # Right padding: the last real token of a row sits at (number of attended tokens - 1).
    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]

def get_embeddings(texts, model, tokenizer, max_length=4096, batch_size=32):
    """Embed texts in mini-batches and return one L2-normalised vector per text.

    Args:
        texts: Sequence of strings to embed. A single string is treated as one text
            rather than being sliced into character chunks.
        model: Model whose forward output exposes ``last_hidden_state``. Inputs are
            moved to ``model.device`` before the forward pass.
        tokenizer: Tokenizer matching ``model``; called with padding and truncation.
        max_length: Token limit passed to the tokenizer; longer inputs are truncated.
        batch_size: Number of texts per forward pass. Must be positive.

    Returns:
        ``np.ndarray`` with one row per input text, in input order, in the dtype the
        model emits. Empty input yields an array of shape
        ``(0, model.config.hidden_size)`` instead of raising.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # A bare string would otherwise be sliced into batch_size-character chunks.
    if isinstance(texts, str):
        texts = [texts]

    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        # Materialise the slice as a plain list so Series/array inputs are handed to
        # the tokenizer in the form it expects.
        batch_texts = list(texts[i : i + batch_size])

        batch_dict = tokenizer(batch_texts, max_length=max_length, padding=True, truncation=True, return_tensors="pt")
        batch_dict = {k: v.to(model.device) for k, v in batch_dict.items()}

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = model(**batch_dict)

        # Use the hidden state of the last real token of each sequence as its embedding.
        embeddings = last_token_pool(outputs.last_hidden_state, batch_dict["attention_mask"])

        # Unit-length rows, so inner product equals cosine similarity.
        embeddings = F.normalize(embeddings, p=2, dim=1)

        # Move each batch off the device right away so memory use stays bounded.
        all_embeddings.append(embeddings.cpu().numpy())

    if not all_embeddings:
        # np.vstack([]) raises; return a well-formed empty matrix in the model's dtype.
        empty = torch.empty((0, model.config.hidden_size), dtype=getattr(model, "dtype", torch.float32))
        return empty.numpy()

    return np.vstack(all_embeddings)


# Embed every product document. batch_size is lowered from get_embeddings' default
# of 32 to 16 here — presumably to fit accelerator memory; TODO confirm.
batch_size = 16
embeddings = get_embeddings(result_array, model, tokenizer, batch_size=batch_size)

# Sanity check: expect one row per product and one column per hidden dimension.
print(embeddings.shape)
print(embeddings)


(10200, 4096)
[[-0.00223  -0.01529  -0.01616  ...  0.01277  -0.03455  -0.00944 ]
 [ 0.005222 -0.002394  0.02354  ...  0.011246 -0.0284   -0.01431 ]
 [-0.00896   0.00813   0.01146  ...  0.003876 -0.02586  -0.006336]
 ...
 [-0.01182   0.0189    0.008095 ... -0.003273 -0.01124   0.001532]
 [-0.00619   0.004642  0.01692  ... -0.004555 -0.00444  -0.004665]
 [-0.01607   0.00909   0.012535 ... -0.00455  -0.00888   0.01744 ]]


In [5]:
# Inspect the tokenizer's declared maximum sequence length. The value printed below
# equals int(1e30), which appears to be a "no limit configured" placeholder rather than
# a real limit, so truncation is governed by the max_length passed in get_embeddings.
print(tokenizer.model_max_length)

1000000000000000019884624838656


In [6]:
# Hidden-state width reported by the model config; the embedding dimension is expected to match it.
print(model.config.hidden_size)

4096


In [7]:
# Dimension of the embeddings actually produced; it should equal model.config.hidden_size.
embeddings.shape[1]

4096

In [8]:
import faiss
import numpy as np
import torch.nn.functional as F

# FAISS operates on C-contiguous float32 matrices. astype() returns a fresh copy, so the
# in-place normalisation below leaves the original `embeddings` array untouched.
embeddings_f32 = np.ascontiguousarray(embeddings.astype(np.float32))

# Re-normalise rows to unit L2 length in float32 so that inner product equals cosine
# similarity. faiss.normalize_L2 works in place, which avoids the NumPy -> torch -> NumPy
# round trip and its additional copies of the full matrix.
faiss.normalize_L2(embeddings_f32)

# Exact (brute-force) inner-product index sized to the embedding dimension.
index = faiss.IndexFlatIP(embeddings_f32.shape[1])

# Add embeddings to index; row i of the matrix becomes vector id i.
index.add(embeddings_f32)

print(f"Number of vectors in the index: {index.ntotal}")


Number of vectors in the index: 10200


In [None]:
import os

# Persist the index so later sessions can reload it instead of re-embedding everything.
# The target directory is created first: writing into a missing folder fails, which
# would break a fresh "Restart & Run All".
index_path = "Text Vector Store/vector_store.faiss"
os.makedirs(os.path.dirname(index_path), exist_ok=True)
faiss.write_index(index, index_path)

In [None]:
# Reload the persisted index from disk, replacing the in-memory `index` (e.g. after a kernel restart).
index = faiss.read_index("Text Vector Store/vector_store.faiss")

In [None]:
# New input product description
# NOTE(review): instruction-tuned embedding models commonly expect a task/instruction
# prefix on queries (but not on indexed documents) — check the model card. TODO confirm.
input_text = ["laptop charger"]
input_embedding = get_embeddings(input_text, model, tokenizer)

# FAISS needs C-contiguous float32. astype() copies, so the in-place normalisation
# below does not modify input_embedding.
input_embedding_f32 = np.ascontiguousarray(input_embedding.astype(np.float32))

# Unit-normalise the query the same way as the indexed vectors so inner-product
# scores are cosine similarities.
faiss.normalize_L2(input_embedding_f32)

# Define a similarity threshold (adjust based on your data)
SIMILARITY_THRESHOLD = 0.38  # Lower scores mean low similarity

# Number of nearest neighbours to retrieve
TOP_K = 5

# Perform the search: D holds the scores and I the matching vector ids, best first.
D, I = index.search(input_embedding_f32, TOP_K)

# Only the best match is compared with the threshold; when it passes, all TOP_K hits
# are reported, including any that individually score below the threshold.
if D[0][0] < SIMILARITY_THRESHOLD:
    print("No relevant match found.")
    print("Similarity scores:", D)
else:
    print("Most similar product indices:", I)
    print("Similarity scores:", D)


Most similar product indices: [[324 261 269]]
Similarity scores: [[0.6473319  0.6468511  0.64471555]]
