# Preparation

In [1]:
!pip install datasets
!pip install pandas
!pip install openai
!pip install tiktoken
!pip install matplotlib
!pip install requests
!pip install cohere
!pip install faiss-cpu
!pip install tqdm

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m491.2/491.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

In [2]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import cohere
import openai
import os
import time
import json
import faiss
import numpy as np

In [3]:
# URL of the JSONL file
url = "https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/meta_categories/meta_Amazon_Fashion.jsonl"

# Local filename to save the file
output_file = "meta_Amazon_Fashion.jsonl"

# Send GET request
response = requests.get(url, stream=True)

# Check if request was successful
if response.status_code == 200:
    with open(output_file, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"File downloaded successfully and saved as '{output_file}'")
else:
    print(f"Failed to download file. Status code: {response.status_code}")

File downloaded successfully and saved as 'meta_Amazon_Fashion.jsonl'


## OpenAI and Cohere API setups

In [4]:
from google.colab import userdata
openai.api_key = userdata.get('openai')

In [5]:
os.environ["COHERE_API_KEY"] = userdata.get('cohere')

## Data preparation

In [6]:
EXTRACT_SIZE = 2000

In [7]:
# Path to the JSONL file
file_path = "meta_Amazon_Fashion.jsonl"

# Load JSONL into a pandas DataFrame
#extract_df = pd.read_json(file_path, lines=True)

# Read the first lines manually
with open(file_path, 'r', encoding='utf-8') as f:
    lines = [next(f) for _ in range(EXTRACT_SIZE)]

# Convert the lines (as strings) into a DataFrame
extract_df = pd.read_json(''.join(lines), lines=True)


  extract_df = pd.read_json(''.join(lines), lines=True)


In [8]:
extract_df['title_array'] = extract_df['title'].apply(lambda x: [x])

In [9]:
extract_df['concatenated_arrays'] = (
    extract_df
    .apply(
        lambda row: row['title_array'] + row['description'] + row['features'],
        axis=1
    )
)

In [10]:
extract_df["chunk"] = extract_df['concatenated_arrays'].apply(lambda x: ' '.join(x))

In [11]:
extract_df.head(2)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,title_array,concatenated_arrays,chunk
0,AMAZON FASHION,YUEDGE 5 Pairs Men's Moisture Control Cushione...,4.6,16,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],GiveGift,[],{'Package Dimensions': '10.31 x 8.5 x 1.73 inc...,B08BHN9PK5,,[YUEDGE 5 Pairs Men's Moisture Control Cushion...,[YUEDGE 5 Pairs Men's Moisture Control Cushion...,YUEDGE 5 Pairs Men's Moisture Control Cushione...
1,AMAZON FASHION,DouBCQ Women's Palazzo Lounge Wide Leg Casual ...,4.1,7,"[Drawstring closure, Machine Wash]",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],DouBCQ,[],{'Package Dimensions': '15 x 10.2 x 0.4 inches...,B08R39MRDW,,[DouBCQ Women's Palazzo Lounge Wide Leg Casual...,[DouBCQ Women's Palazzo Lounge Wide Leg Casual...,DouBCQ Women's Palazzo Lounge Wide Leg Casual ...


In [12]:
# Create the dictionary
chunks_dict = {np.int64(idx): desc for idx, desc in extract_df['chunk'].items()}
data = list(chunks_dict.values())
ids = np.array(list(chunks_dict.keys()))

In [13]:
chunks_dict[0]

"YUEDGE 5 Pairs Men's Moisture Control Cushioned Dry Fit Casual Athletic Crew Socks for Men (Blue, Size 9-12)"

# Proto

## Create vector DB

In [14]:
embed_model="text-embedding-3-large"
dimension = 3072
index_flat = faiss.IndexFlatL2(dimension)  # L2 (Euclidean) distance
index = faiss.IndexIDMap(index_flat)       # Wrap to support custom IDs

In [15]:
def embed(batch: list[str]) -> list[float]:
    # create embeddings (exponential backoff to avoid RateLimitError)
    for j in range(5):  # max 5 retries
        try:
            res = openai.embeddings.create(
                input=batch,
                model=embed_model
            )
            passed = True
        except openai.RateLimitError:
            time.sleep(2**j)  # wait 2^j seconds before retrying
            print("Retrying...")
    if not passed:
        raise RuntimeError("Failed to create embeddings.")
    # get embeddings
    embeds = np.array([record.embedding for record in res.data])
    return embeds

In [16]:
from tqdm.auto import tqdm
from datetime import datetime

batch_size = 1000  # how many embeddings we create and insert at once
count_batch = 1
for i in tqdm(range(0, len(data), batch_size)):
    passed = False
    # find end of batch
    i_end = min(len(data), i+batch_size)
    # create batch
    batch = data[i:i_end]
    print(batch[0])
    print(len(batch))
    embeds = embed(batch)
    # Final Step: Add vectors with corresponding IDs
    index.add_with_ids(embeds, ids[i:i_end])
    # Checkpointing and saving
    if count_batch % 100 == 0:
      current_time = datetime.now().strftime("%Y%m%d%H%M%S")
      index_path = "indices/checkpoint_" + str(count_batch) + "_faiss_size_" + str(len(data)) + "_" + embed_model + "_" + current_time + ".index"
      faiss.write_index(index, index_path)
    count_batch += 1


# Save the index to a file
# Current time
current_time = datetime.now().strftime("%Y%m%d%H%M%S")
index_path = "indices/faiss_size_" + str(len(data)) + "_" + embed_model + "_" + current_time + ".index"
faiss.write_index(index, index_path)

  0%|          | 0/2 [00:00<?, ?it/s]

YUEDGE 5 Pairs Men's Moisture Control Cushioned Dry Fit Casual Athletic Crew Socks for Men (Blue, Size 9-12)
1000
YEAQING Women Jogger Sweatpants Drawstring Workout Running Cargo Pants High Waisted Lounge Pants with Pockets Drawstring closure
1000


In [17]:
current_time = datetime.now().strftime("%Y%m%d%H%M%S")
index_path = "indices/checkpoint_" + str(count_batch) + "_faiss_size_" + str(len(data)) + "_" + embed_model + "_" + current_time + ".index"
faiss.write_index(index, index_path)

## Retriever

In [18]:
def get_docs(query: str, top_k: int) -> list[str]:
    # encode query
    xq = embed([query])
    # search pinecone index
    D, I = index.search(xq, k=top_k) # distance, index
    # get doc text
    doc_to_index_id = {chunks_dict[idx]: (rank, idx) for rank, idx in enumerate(I.tolist()[0])}
    return doc_to_index_id

In [19]:
query = "I want a nice beige dress for a wedding."
doc_to_index_id = get_docs(query, top_k=50)

## Reranker

In [20]:
co = cohere.Client(os.environ["COHERE_API_KEY"])

In [21]:
documents=list(doc_to_index_id.keys())

In [22]:
rerank_docs = co.rerank(
    query=query, documents=documents, top_n=5, model="rerank-english-v2.0"
)

In [23]:
# Display the reranking results
def return_results(results, documents, doc_to_index_id):
    for idx, result in enumerate(results.results):
        print(f"Rank: {idx+1}")
        print(f"Rank before rerank: {result.index}")
        print(f"Rank before rerank: {doc_to_index_id[documents[result.index]][0]+1}")
        print(f"Score: {result.relevance_score}")
        print(f"Document: {documents[result.index]}")
        print(f"Index in Vectr DB: {doc_to_index_id[documents[result.index]][1]}")
        print("============================================================")
        print("VALIDATE POSITION IN DB")
        print(documents[result.index]==data[doc_to_index_id[documents[result.index]][1]])
        print("============================================================")
        print("\n")
return_results(rerank_docs, documents, doc_to_index_id)

Rank: 1
Rank before rerank: 1
Rank before rerank: 2
Score: 0.58302534
Document: Floerns Women's Short Sleeve Flower Casual T-shirt Dress Beige XS Pull On closure Hand Wash Only
Index in Vectr DB: 1572
VALIDATE POSITION IN DB
True


Rank: 2
Rank before rerank: 3
Rank before rerank: 4
Score: 0.29279748
Document: Womens Casual T Shirt Dress 3/4 Sleeve Floral Loose Shift Dress Tunic Sundress 2X-Large Black Beige
Index in Vectr DB: 161
VALIDATE POSITION IN DB
True


Rank: 3
Rank before rerank: 34
Rank before rerank: 35
Score: 0.21452682
Document: Lover Kiss Women's Mother Of The Bride Maxi Formal Evening Gown 10 Dusty Thistle Zipper closure
Index in Vectr DB: 1482
VALIDATE POSITION IN DB
True


Rank: 4
Rank before rerank: 35
Rank before rerank: 36
Score: 0.14817041
Document: GRACE KARIN 40s Retro Summer Party Dress with Belt Knee Length Wedding Swing Dress Black XL Hand Wash Only
Index in Vectr DB: 84
VALIDATE POSITION IN DB
True


Rank: 5
Rank before rerank: 19
Rank before rerank: 20
Score