In [2]:
# Install pinned versions of the notebook's dependencies
# (-q: quiet output, -U: upgrade packages that are already installed).
!pip install -qU \
  transformers==4.31.0 \
  sentence-transformers==2.2.2 \
  pinecone-client==2.2.2 \
  openai==1.3.2 \
  tiktoken==0.5.1 \
  langchain==0.0.336

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.1/179.1 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.3/220.3 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.2 MB/s[0m e

# Creating Embeddings with an OpenAI Model

Load our dataset:

In [3]:
import pandas as pd

# Read the wine reviews and replace every missing value with the string
# "unknown", because Pinecone cannot handle NaNs in the metadata.
wine = pd.read_csv('wine_cleaned_rev_concat.csv').fillna("unknown")

# After the fill, 'price' holds both numbers and "unknown"; store the whole
# column as strings so it can be included in the Pinecone metadata without
# triggering an error.
wine['price'] = wine['price'].astype(str)


We will use the `text-embedding-ada-002` embedding model from OpenAI. The maximum number of tokens that it can handle is 8192. Let's check if any of our descriptions has more than 8192 tokens. If not, then no chunking will be necessary.

In [4]:
# Character length of every description, and the index label of the longest one
description_lengths = wine['description'].str.len()
max_description_index = description_lengths.idxmax()

# Pull out that row and show how many characters its description has
row_with_longest_description = wine.loc[max_description_index]
len(row_with_longest_description['description'])

1854

The longest description is 1854 characters long. A token normally covers several characters, so even the longest description should contain far fewer than 8192 tokens, but let's verify this.

In [5]:
import os
import openai

# Resolve the OpenAI API key once: the OPENAI_API_KEY environment variable
# takes precedence, and the placeholder string is only a fallback (replace it
# with your own key from the OpenAI website if you do not use the environment
# variable). Storing the resolved value in OPENAI_API_KEY ensures that every
# later use of this name receives the real key instead of the placeholder.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or "OPENAI_API_KEY"

openai.api_key = OPENAI_API_KEY

Import the tiktoken tokenizer:

In [6]:
import tiktoken

# Look up the tokenizer of the embedding model that is used in this notebook
tiktoken.encoding_for_model('text-embedding-ada-002')

<Encoding 'cl100k_base'>

Define the function that outputs the token length of a given string:

In [7]:
tokenizer = tiktoken.get_encoding('cl100k_base')

def tiktoken_len(text):
    """Return how many tokens `tokenizer` produces for `text`.

    `disallowed_special=()` is forwarded to `tokenizer.encode`; per the
    tiktoken API this disables the check that rejects text containing
    special-token strings, so such text is encoded as ordinary text.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

tiktoken_len("Hi, I'm a piece of text.")

9

See what the token length of our longest description is:

In [8]:
# Token count of the description that has the most characters
tiktoken_len(row_with_longest_description.description)

426

This is much smaller than 8192, so we don't need to split our descriptions into chunks. Now let's create embeddings for every description and save them to a tensor file. First, define the embedding model:

In [9]:
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = 'text-embedding-ada-002'

# Resolve the key the same way as for `openai.api_key`: the environment
# variable wins and OPENAI_API_KEY is only the fallback. Passing the bare
# constant here would ignore a key supplied through the environment.
embed_model = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=os.getenv("OPENAI_API_KEY") or OPENAI_API_KEY
)

See how it works:

In [10]:
# Two toy documents to sanity-check the embedding model
docs = ["this is one document", "and another document"]

# One embedding vector is returned per input document
embeddings = embed_model.embed_documents(docs)

print(
    f"We have {len(embeddings)} doc embeddings, each with "
    f"a dimensionality of {len(embeddings[0])}."
)

We have 2 doc embeddings, each with a dimensionality of 1536.


Now we're ready to create a tensor file:

In [15]:
import math
import torch
from tqdm import tqdm

# All wine descriptions as a plain list, in DataFrame row order
descriptions = wine['description'].tolist()

# Number of descriptions passed to each embed_documents call
batch_size = 100
num_batches = math.ceil(len(descriptions) / batch_size)

# Lazily slice the descriptions into consecutive batches; slicing clamps at
# the end of the list, so the final batch may be shorter
description_batches = (
    descriptions[batch_no * batch_size:(batch_no + 1) * batch_size]
    for batch_no in range(num_batches)
)

# Embed batch by batch, collecting one embedding per description
description_embeddings = []
for batch_descriptions in tqdm(description_batches, total=num_batches, desc="Processing Batches"):
    description_embeddings.extend(embed_model.embed_documents(batch_descriptions))

# Stack the embeddings into one tensor and write it to disk
description_tensor = torch.tensor(description_embeddings)
torch.save(description_tensor, 'description_embeddings_openai_ada-002.pt')


Processing Batches: 100%|██████████| 846/846 [17:18<00:00,  1.23s/it]


# Sending Embeddings and Metadata to Pinecone

We can now send these embeddings, along with the corresponding metadata, to Pinecone. First, initialize access to Pinecone:

In [12]:
import os
import pinecone
from tqdm import tqdm

# Credentials are read from the environment when set; otherwise replace the
# placeholder strings with the API key from app.pinecone.io and the
# environment name shown in the console
pinecone_api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'
pinecone_environment = os.environ.get('PINECONE_ENVIRONMENT') or 'PINECONE_ENVIRONMENT'

pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)

See what indexes exist already:

In [15]:
# Show the indexes returned by Pinecone for the initialized client
pinecone.list_indexes()

[]

Create a new index (if it does not already exist):

In [16]:
import time

index_name = 'rag-openai'

index_exists = index_name in pinecone.list_indexes()
if not index_exists:
    # Size the index to the length of the embedding vectors computed earlier
    pinecone.create_index(index_name, dimension=len(embeddings[0]), metric='cosine')

    # Poll once per second until the new index reports that it is ready
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

Now connect to the index:

In [20]:
# Get a handle on the index named by `index_name`
index = pinecone.Index(index_name)
# Display the index statistics
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

Use the tensor file to send embeddings for 'description' to Pinecone. Also include the corresponding pieces of metadata:

In [21]:
import torch
from tqdm import tqdm

# Load the PyTorch tensor from the tensor file
description_tensor = torch.load('description_embeddings_openai_ada-002.pt')

# Row i of the tensor is paired with row i of `wine` below. If the two differ
# in length, the pairing would silently attach metadata to the wrong vectors
# (or drop rows), so fail loudly instead.
if len(description_tensor) != len(wine):
    raise ValueError(
        f"Embedding tensor has {len(description_tensor)} rows but the wine "
        f"DataFrame has {len(wine)}; they must be aligned row by row."
    )

# Pinecone metadata key -> column of `wine` that supplies its value
metadata_columns = {
    'text': 'description',
    'title': 'title',
    'region': 'region_cleaned',
    'winery': 'winery',
    'variety': 'variety',
    'province': 'province',
    'price': 'price',
    'designation': 'designation',
    'country': 'country',
    'style1': 'style1',
    'style2': 'style2',
}

# Define the batch size
batch_size = 32

# Create a loop to process data in batches
for i in tqdm(range(0, len(description_tensor), batch_size)):
    i_end = min(len(description_tensor), i + batch_size)

    # Rows of the 'wine' DataFrame that correspond to this batch of embeddings
    metadata_batch = wine.iloc[i:i_end]

    # Vector ids are the wine ids rendered as strings
    ids = metadata_batch['id'].astype(str).tolist()

    # Build one metadata dict per row from a single pass over the batch
    metadata = [
        {key: record[column] for key, column in metadata_columns.items()}
        for record in metadata_batch.to_dict('records')
    ]

    # Extract the embeddings directly from the loaded tensor and convert to Python list
    embeds = description_tensor[i:i_end].numpy().tolist()

    # Prepare the vectors as a list of (id, embedding, metadata) tuples
    vectors = list(zip(ids, embeds, metadata))

    # Add the vectors to Pinecone using index.upsert()
    index.upsert(vectors=vectors)


100%|██████████| 2641/2641 [11:11<00:00,  3.93it/s]


Check the index stats:

In [22]:
# Display the index statistics again, now that the vectors have been upserted
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.7,
 'namespaces': {'': {'vector_count': 84502}},
 'total_vector_count': 84502}

# Querying with Pinecone

We can now query the indexed data like this. Here, we use the filter `"country": 'France'`, so only wines from France will appear.



In [36]:
query = "Fruity rich wine"

# Embed the query text with the same model that embedded the descriptions
embedding = embed_model.embed_query(query)

# Metadata filter passed to the query: 'country' must be 'France'
country_filter = {"country": 'France'}

# Ask for the 5 best matches together with their stored metadata
result = index.query(
    vector=embedding,
    filter=country_filter,
    top_k=5,
    include_metadata=True,
)

The plain result looks like this:

In [37]:
# Display the raw query response
result

{'matches': [{'id': '125349',
              'metadata': {'country': 'France',
                           'designation': "Domaine D'E Croce",
                           'price': '28.0',
                           'province': 'France Other',
                           'region': 'Patrimonio',
                           'style1': 'Rosé - France Other',
                           'style2': 'Rosé - France Other',
                           'text': 'Rich, fruity and soft, this wine has a '
                                   'tang of orange peel along with ripe citrus '
                                   'and red fruits. Fresh and crisp at the '
                                   'end, it is ready to drink.',
                           'title': "Yves Leccia 2015 Domaine D'E Croce Rosé "
                                    '(Patrimonio)',
                           'variety': 'Rosé',
                           'winery': 'Yves Leccia'},
              'score': 0.915218771,
              'values':

We can also make it look more presentable:

In [38]:
def _flatten_match(match):
    """Return one flat dict holding a match's id, score and metadata fields."""
    flat = {'id': match['id'], 'score': match['score']}
    flat.update(match['metadata'])
    return flat

# One flat record per match in the query result
data = [_flatten_match(match) for match in result['matches']]

# Tabulate the records and show the first rows
result_df = pd.DataFrame(data)
result_df.head()

Unnamed: 0,id,score,country,designation,price,province,region,style1,style2,text,title,variety,winery
0,125349,0.915219,France,Domaine D'E Croce,28.0,France Other,Patrimonio,Rosé - France Other,Rosé - France Other,"Rich, fruity and soft, this wine has a tang of...",Yves Leccia 2015 Domaine D'E Croce Rosé (Patri...,Rosé,Yves Leccia
1,61437,0.913814,France,Château de Selle,43.0,Provence,Côtes de Provence,Rosé - Provence,Rosé - Provence,"A rich, fat wine, with the weight of concentra...",Domaines Ott 2010 Château de Selle Rosé (Côtes...,Rosé,Domaines Ott
2,113195,0.913025,France,Domaine de Michelons,unknown,Beaujolais,Moulin-à-Vent,Gamay - Beaujolais,Gamay - Beaujolais,"Rich, fruity wine that is full of black plum j...",Georges Duboeuf 2009 Domaine de Michelons (Mo...,Gamay,Georges Duboeuf
3,75652,0.912072,France,Cèdre Heritage,17.0,Southwest France,Cahors,Malbec - Southwest France,Malbec - Southwest France,"Rich and fruity, this is full of red berry and...",Château du Cèdre 2010 Cèdre Heritage Malbec (C...,Malbec,Château du Cèdre
4,64704,0.910887,France,La Patache,12.0,Bordeaux,Médoc,Bordeaux-style Red Blend - Bordeaux,Bordeaux-style Red Blend - Bordeaux,A rich wine laden with juicy blackberry fruits...,Domaines Lapalu 2009 La Patache (Médoc),Bordeaux-style Red Blend,Domaines Lapalu


# Querying with Faiss

In [28]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


Now load our tensor file and initialize a list of labels:

In [26]:
# Embeddings saved earlier in the notebook
knn_vectors = torch.load("description_embeddings_openai_ada-002.pt")

# Wine ids as strings, in DataFrame row order
knn_labels = wine['id'].astype(str).tolist()

Now we can create a Faiss index using our tensor file:

In [39]:
import faiss

# Convert the data to a NumPy array for use with faiss. Work on a float32,
# contiguous copy so that the in-place normalization below does not modify
# `knn_vectors`.
vectors_np = knn_vectors.detach().clone().to(torch.float32).contiguous().numpy()

# Determine the dimension of the vectors
dimension = vectors_np.shape[1]

# IndexFlatIP ranks by inner product, which equals cosine similarity only for
# unit-length vectors, so L2-normalize every row (in place) before indexing.
faiss.normalize_L2(vectors_np)

# Build the Faiss inner-product index over the normalized vectors
faiss_index = faiss.IndexFlatIP(dimension)
faiss_index.add(vectors_np)  # Add the data vectors to the index


And we can query the indexed data like this:

In [40]:
import numpy as np

# Define the text query
query_text = "Fruity rich wine"

# Embed the text query to obtain a vector using the embedding model
query_vector = embed_model.embed_query(query_text)

# Faiss expects a float32 matrix with one row per query. Build it explicitly
# as float32 (a plain np.array of Python floats would be float64) and
# L2-normalize it in place so the inner product is not scaled by the query norm.
query_matrix = np.array([query_vector], dtype=np.float32)
faiss.normalize_L2(query_matrix)

# Specify the number of neighbors to return
k = 5

# Perform a nearest neighbor search to find the closest neighbors to the query vector.
# D will contain the inner products between the normalized query and its neighbors;
# these are cosine similarities provided the indexed vectors are unit length as well.
# I will contain the indices of the nearest neighbors in the dataset.
D, I = faiss_index.search(query_matrix, k)

# Get labels of the neighbors
neighbor_labels = [knn_labels[i] for i in I[0]]

# Extract rows from the wine DataFrame based on the indices of the nearest neighbors
faiss_result = wine.iloc[I[0]]

If you run the same query against Pinecone without the `country` filter, you get exactly the same results, which is expected.

In [41]:
# Display the wine rows selected by the Faiss search
faiss_result

Unnamed: 0,id,country,description,designation,points,price,province,title,variety,winery,region_cleaned,style1,style2
81734,125349,France,"Rich, fruity and soft, this wine has a tang of...",Domaine D'E Croce,88.0,28.0,France Other,Yves Leccia 2015 Domaine D'E Croce Rosé (Patri...,Rosé,Yves Leccia,Patrimonio,Rosé - France Other,Rosé - France Other
15970,22632,Spain,Fruity is the best way to describe this young ...,Seis,88.0,16.0,Northern Spain,Paco Garcia 2011 Seis (Rioja),Tempranillo,Paco Garcia,Rioja,Tempranillo - Northern Spain,Tempranillo - Northern Spain
41779,61437,France,"A rich, fat wine, with the weight of concentra...",Château de Selle,90.0,43.0,Provence,Domaines Ott 2010 Château de Selle Rosé (Côtes...,Rosé,Domaines Ott,Côtes de Provence,Rosé - Provence,Rosé - Provence
28352,40854,US,"Fruity like grape jam and blackberry syrup, th...",Dark,87.0,10.0,California,Belle Ambiance 2015 Dark Red (California),Red Blend,Belle Ambiance,California,Red Blend - California,Red Blend - California
74371,113195,France,"Rich, fruity wine that is full of black plum j...",Domaine de Michelons,88.0,unknown,Beaujolais,Georges Duboeuf 2009 Domaine de Michelons (Mo...,Gamay,Georges Duboeuf,Moulin-à-Vent,Gamay - Beaujolais,Gamay - Beaujolais
