In [None]:
!pip install -qU \
  transformers==4.31.0 \
  pinecone-client==2.2.3 \
  openai==1.3.2 \
  tiktoken==0.5.1 \
  langchain==0.0.336

# Creating Embeddings with an OpenAI Model

Load our dataset:

In [None]:
import pandas as pd

# Read the wine reviews, then in one chain:
#   * replace NaN values with the string "unknown", because Pinecone cannot
#     handle NaNs in the metadata
#   * rename the 'region_cleaned' column to 'region'
wine = pd.read_csv('wine_cleaned_rev_concat.csv')
wine = wine.fillna("unknown").rename(columns={'region_cleaned': 'region'})


Add a new column that contains the information in all columns, for every wine:

In [None]:
# Create a copy of the DataFrame to avoid modifying the original
wine_info = wine.copy()

# Define a function to concatenate information from all columns
def concatenate_info(row, exclude=("id",)):
    """Join one row's fields into a single "Column: value, ..." string.

    The columns are taken from the row itself (``row.index``), so the function
    does not depend on any module-level DataFrame and can be applied to any
    frame whose column labels are strings.

    Args:
        row: A pandas Series representing one DataFrame row, as passed by
            ``DataFrame.apply(..., axis=1)``.
        exclude: Column labels to leave out of the string. Defaults to
            ``("id",)``, i.e. only the "id" column is skipped.

    Returns:
        A string such as ``"Country: US, Points: 90.0"``. Each column label is
        capitalized and the pairs are separated by ", ". An empty string is
        returned when no columns remain after exclusion.
    """
    info_columns = [col for col in row.index if col not in exclude]

    return ", ".join(f"{col.capitalize()}: {row[col]}" for col in info_columns)

# Apply the function to create the new column
wine_info["info"] = wine_info.apply(concatenate_info, axis=1)

Here's what a single concatenated info entry looks like:

In [None]:
# Display the concatenated 'info' string of the row at positional index 2236
wine_info.iloc[2236]['info']

"Country: US, Description: Creamy, lush and somewhat robust, this dry sparkler offers a thick mousse and touch of richness in its mix of nutty bread dough, green apple, pear and lime. Made predominantly from Pinot Noir, this is an easygoing, approachable sparkling wine, offering dry creamy layers of green apple, pear and strawberry. It also has a rich inviting mousse embedded with vanilla and marzipan. A wonderfully drinkable sparkling wine that appeals immediately for its balanced texture and rich flavors. Easy to like for its array of raspberries, limes, vanilla, toast and yeast. The sour lees note is nicely balanced with the sweet liqueur of dosage. Made from mostly Pinot Noir grapes, with an addition of 8.8% Chardonnay, this Méthode Champenoise wine is sultry in raspberry cream and crème brûlée, with enough acidity to keep it fresh in the glass. The finish offers a taste of toast with ginger jam. This wine shows lots of finesse for the price. The mousse is exceptionally refined, ca

We will use the `text-embedding-ada-002` embedding model from OpenAI. The maximum number of tokens that it can handle is 8192. Let's check if any of our descriptions has more than 8192 tokens. If not, then no chunking will be necessary.

In [None]:
# Character length of every 'info' string
info_lengths = wine_info['info'].apply(len)

# Index label of the longest 'info' string, and the row it belongs to
max_info_index = info_lengths.idxmax()
row_with_longest_info = wine_info.loc[max_info_index]

# Report how many characters the longest 'info' string has
print(len(row_with_longest_info['info']))


2215


The longest entry is 2215 characters long. Since a token usually spans several characters, its token count should be far lower than that, but let's verify this.

In [None]:
import os
import openai

# Get an API key from the OpenAI website.
# Resolve the key once: prefer the OPENAI_API_KEY environment variable and fall
# back to the placeholder string (replace it with a real key if the environment
# variable is not set). Keeping the resolved value in OPENAI_API_KEY means any
# code that passes OPENAI_API_KEY explicitly uses the same key as `openai`.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or "OPENAI_API_KEY"

openai.api_key = OPENAI_API_KEY

Import the tiktoken tokenizer:

In [None]:
import tiktoken

# Look up the encoding for the embedding model this notebook uses
tiktoken.encoding_for_model('text-embedding-ada-002')

<Encoding 'cl100k_base'>

Define the function that outputs the token length of a given string:

In [None]:
# Shared tokenizer instance used by tiktoken_len below
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of tokens `text` is encoded into by `tokenizer`.

    `disallowed_special=()` is passed so that no special-token string in the
    input is treated as disallowed during encoding.
    """
    return len(tokenizer.encode(text, disallowed_special=()))


# Quick sanity check on a short sentence
tiktoken_len("Hi, I'm a piece of text.")

9

See what the token length of our longest description is:

In [None]:
tiktoken_len(row_with_longest_info['info'])

527

This is much smaller than 8192, so we don't need to split our descriptions into chunks. Now let's create embeddings for every description and save them to a tensor file. First, define the embedding model:

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Name of the OpenAI embedding model used throughout the notebook
model_name = 'text-embedding-ada-002'

# LangChain wrapper that produces embeddings with the model above
embed_model = OpenAIEmbeddings(model=model_name, openai_api_key=OPENAI_API_KEY)

See how it works:

In [None]:
# Two toy documents to try the embedding model on
docs = ["this is one document", "and another document"]

# One embedding vector per document; `embeddings` is reused further down
# to read the vector dimensionality when the Pinecone index is created
embeddings = embed_model.embed_documents(docs)

n_docs = len(embeddings)
n_dims = len(embeddings[0])
print(f"We have {n_docs} doc embeddings, each with a dimensionality of {n_dims}.")

We have 2 doc embeddings, each with a dimensionality of 1536.


Now we're ready to create a tensor file:

In [None]:
import math
import os

import torch
from tqdm import tqdm

# File the embeddings are cached in (later cells load this same file)
EMBEDDINGS_PATH = 'info_embeddings_openai_ada-002_updated.pt'

# Extract the "info" column as a list from wine_info
infos = wine_info['info'].tolist()

# Number of texts sent to the embedding model per call
batch_size = 100

# Calculate the number of batches needed
num_batches = math.ceil(len(infos) / batch_size)

# Embedding every row calls the OpenAI API for each batch, which is slow and
# billed. Reuse a previously saved tensor when it has one row per text.
# NOTE: only the row count is checked; delete the file to force a recompute
# after the texts themselves change.
info_tensor = None
if os.path.exists(EMBEDDINGS_PATH):
    cached_tensor = torch.load(EMBEDDINGS_PATH)
    if len(cached_tensor) == len(infos):
        info_tensor = cached_tensor

# Only populated when the embeddings are (re)computed below
info_embeddings = []

if info_tensor is None:
    # Process data in batches with tqdm
    for i in tqdm(range(num_batches), desc="Processing Batches"):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(infos))
        batch_infos = infos[start_idx:end_idx]

        # Encode the batch of infos using the model
        batch_embeddings = embed_model.embed_documents(batch_infos)

        # Append the batch embeddings to the list
        info_embeddings.extend(batch_embeddings)

    # Convert the list of embeddings to a PyTorch tensor
    info_tensor = torch.tensor(info_embeddings)

    # Save the tensor to a file
    torch.save(info_tensor, EMBEDDINGS_PATH)

Processing Batches: 100%|██████████| 846/846 [20:56<00:00,  1.49s/it]


# Sending Embeddings and Metadata to Pinecone

We can now send these embeddings, along with the corresponding metadata, to Pinecone. First, initialize access to Pinecone:

In [None]:
import os
import pinecone
from tqdm import tqdm

# API key comes from app.pinecone.io, environment from the console.
# Environment variables take precedence; otherwise the placeholder strings are used.
pinecone_api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'
pinecone_environment = os.environ.get('PINECONE_ENVIRONMENT') or 'PINECONE_ENVIRONMENT'

pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)

  from tqdm.autonotebook import tqdm


See what indexes exist already:

In [None]:
pinecone.list_indexes()

[]

Create a new index (if it does not already exist):

In [None]:
import time

index_name = 'rag-openai-combined-updated'

# Only create the index when no index with this name exists yet
index_missing = index_name not in pinecone.list_indexes()
if index_missing:
    # The index dimension is read from the length of a previously computed embedding
    pinecone.create_index(index_name, dimension=len(embeddings[0]), metric='cosine')

    # Poll once per second until the index reports itself as ready
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

Now connect to the index:

In [None]:
# Get an instance of the Pinecone Index with the specified index name
index = pinecone.Index(index_name)

# Retrieve and display statistics about the Pinecone index
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

Use the tensor file to send the embeddings of the 'info' column to Pinecone. Also include the corresponding pieces of metadata:

In [None]:
import torch
from tqdm import tqdm

# Load the PyTorch tensor from the tensor file
info_embeddings_tensor = torch.load('info_embeddings_openai_ada-002_updated.pt')

# Row i of the tensor is paired with row i of wine_info below, so a length
# mismatch would attach embeddings to the wrong wines. Fail early instead.
if len(info_embeddings_tensor) != len(wine_info):
    raise ValueError(
        f"Embedding tensor has {len(info_embeddings_tensor)} rows but "
        f"wine_info has {len(wine_info)} rows"
    )

# Number of vectors sent to Pinecone per upsert call
batch_size = 32

# Create a loop to process data in batches
for i in tqdm(range(0, len(info_embeddings_tensor), batch_size)):
    i_end = min(len(info_embeddings_tensor), i + batch_size)

    # Matching slice of 'wine_info' for the metadata (copied so the 'id'
    # conversion below does not touch wine_info itself)
    metadata_batch = wine_info.iloc[i:i_end].copy()

    # Using 'id' column as the ID; convert 'id' to string for serialization
    metadata_batch['id'] = metadata_batch['id'].astype(str)
    ids = metadata_batch['id'].tolist()

    # Include all columns as metadata: one {column: value} dict per row
    metadata = metadata_batch.to_dict(orient='records')

    # Extract the embeddings directly from the loaded tensor and convert to Python lists
    embeds = info_embeddings_tensor[i:i_end].numpy().tolist()

    # Prepare the vectors as a list of (id, embedding, metadata) tuples
    vectors = list(zip(ids, embeds, metadata))

    # Add the vectors to Pinecone using index.upsert()
    index.upsert(vectors=vectors)


100%|██████████| 2641/2641 [14:52<00:00,  2.96it/s]


Check the index stats:

In [None]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.9,
 'namespaces': {'': {'vector_count': 84502}},
 'total_vector_count': 84502}

# Querying with Pinecone

We can now query the indexed data. Here we use the filter `"country": 'France'`, so only wines from France are returned.



In [None]:
query = "Oaky dry wine"

# Embed the query text with the same model used for the documents
embedding = embed_model.embed_query(query)

# Metadata filter: keep only matches whose 'country' is 'France'
country_filter = {"country": 'France'}

# Ask for the 5 best matches and return their metadata as well
result = index.query(vector=embedding, filter=country_filter, top_k=5, include_metadata=True)

The plain result looks like this:

In [None]:
result

{'matches': [{'id': '18993',
              'metadata': {'country': 'France',
                           'description': 'Dry, firm and tough, this wine '
                                          'lacks the fruit fruit to come '
                                          "through the structure. It's a wine "
                                          'for those who like plenty of wood.',
                           'designation': 'Excellence',
                           'id': '18993',
                           'info': 'Country: France, Description: Dry, firm '
                                   'and tough, this wine lacks the fruit fruit '
                                   "to come through the structure. It's a wine "
                                   'for those who like plenty of wood., '
                                   'Designation: Excellence, Points: 84.0, '
                                   'Price: 16.0, Province: Bordeaux, Title: '
                                   'Les Vigne

We can also make it look more presentable:

In [None]:
# Flatten every match into one dict: id and score first, then all metadata fields
data = []
for match in result['matches']:
    record = {'id': match['id'], 'score': match['score']}
    record.update(match['metadata'])  # metadata keys are merged into the same dict
    data.append(record)

# Build the table, one row per match
result_df = pd.DataFrame(data)

# Display the DataFrame
result_df

Unnamed: 0,id,score,country,description,designation,info,points,price,province,region,style1,style2,style3,title,variety,winery
0,18993,0.857618,France,"Dry, firm and tough, this wine lacks the fruit...",Excellence,"Country: France, Description: Dry, firm and to...",84.0,16.0,Bordeaux,Blaye Côtes de Bordeaux,Bordeaux-style Red Blend - Bordeaux,Bordeaux-style Red Blend - Bordeaux,Bordeaux-style Red Blend - France,Les Vignerons de Tutiac 2015 Excellence (Blay...,Bordeaux-style Red Blend,Les Vignerons de Tutiac
1,43607,0.853897,France,"Oaky notes of buttered toast, dried apple slic...",Reserve,"Country: France, Description: Oaky notes of bu...",84.0,10.0,France Other,Vin de France,Chardonnay - France Other,Chardonnay - France Other,Chardonnay - France,Phantom Bay 2014 Reserve Chardonnay (Vin de Fr...,Chardonnay,Phantom Bay
2,58845,0.853261,France,"This wine is bone-dry, although with some age ...",Le Nombre d'Or Brut Nature,"Country: France, Description: This wine is bon...",91.0,85.0,Champagne,Champagne,Chardonnay - Champagne,Chardonnay - Champagne,Chardonnay - France,Aubry 2009 Le Nombre d'Or Brut Nature Chardonn...,Chardonnay,Aubry
3,88331,0.852645,France,"Yes, it's dry, very dry. However, this well-ma...",Zéro Dosage Brut Nature,"Country: France, Description: Yes, it's dry, v...",90.0,unknown,Champagne,Champagne,Champagne Blend - Champagne,Champagne Blend - Champagne,Champagne Blend - France,Nicolas Maillart NV Zéro Dosage Brut Nature (...,Champagne Blend,Nicolas Maillart
4,21394,0.85117,France,90—92 Barrel sample. Dry botrytis is the theme...,Barrel sample,"Country: France, Description: 90—92 Barrel sam...",91.0,unknown,Bordeaux,Barsac,Bordeaux-style White Blend - Bordeaux,Bordeaux-style White Blend - Bordeaux,Bordeaux-style White Blend - France,Château Suau 2006 Barrel sample (Barsac),Bordeaux-style White Blend,Château Suau


# Querying with Faiss

In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


Now load our tensor file and initialize a list of labels:

In [None]:
# Load the KNN vectors
knn_vectors = torch.load("info_embeddings_openai_ada-002_updated.pt")

# Use the 'id' column as labels in KNN
knn_labels = wine_info['id'].tolist()

Now we can create a Faiss index using our tensor file:

In [None]:
import faiss

# Copy the embeddings into a float32 NumPy array for use with Faiss.
# astype() makes a copy, so the in-place normalization below does not
# modify knn_vectors.
vectors_np = knn_vectors.numpy().astype('float32')

# IndexFlatIP scores by raw inner product, which equals cosine similarity only
# when the vectors have unit length. Normalize the stored vectors (in place)
# so this holds regardless of how the embeddings were produced.
faiss.normalize_L2(vectors_np)

# Determine the dimension of the vectors
dimension = vectors_np.shape[1]  # This represents the dimension of the vectors

# Build a flat (exact search) inner-product index over the normalized vectors
faiss_index = faiss.IndexFlatIP(dimension)
faiss_index.add(vectors_np)  # Add the data vectors to the index


And we can query the indexed data like this:

In [None]:
import numpy as np

# Define the text query
query_text = "Oaky dry wine"

# Embed the text query to obtain a vector using the embedding model
query_vector = embed_model.embed_query(query_text)

# Specify the number of neighbors to return
k = 5

# Faiss works on float32 matrices with one query per row
query_matrix = np.asarray([query_vector], dtype='float32')

# Scale the query to unit length so that the inner-product scores are cosine
# similarities (this assumes the indexed vectors are unit length as well -
# TODO confirm for the index in use)
query_norm = np.linalg.norm(query_matrix)
if query_norm > 0:
    query_matrix /= query_norm

# Perform a nearest neighbor search to find the closest neighbors to the query vector.
# D will contain the similarity scores between the query vector and its neighbors.
# I will contain the indices of the nearest neighbors in the dataset.
D, I = faiss_index.search(query_matrix, k)

# Faiss pads the result with -1 when fewer than k neighbors are found; drop
# those so they are not interpreted as "last row" by positional indexing
neighbor_indices = I[0][I[0] >= 0]

# Get labels of the neighbors
neighbor_labels = [knn_labels[i] for i in neighbor_indices]

# Extract rows from the wine DataFrame based on the indices of the nearest neighbors
faiss_result = wine.iloc[neighbor_indices]

Here's the result:

In [None]:
faiss_result

Unnamed: 0,id,country,description,designation,points,price,province,title,variety,winery,region,style1,style2,style3
25402,36480,US,Oaky bacon and barbecue smoke make for a burly...,Estate Grown,85.0,25.0,California,Fritz 2013 Estate Grown Zinfandel (Dry Creek V...,Zinfandel,Fritz,Dry Creek Valley,Zinfandel - California,Zinfandel - Dry Creek Valley,Zinfandel - Dry Creek Valley
29515,42685,US,Dry indeed; this is so dry as to be outright s...,Dry,84.0,14.0,Oregon,Willamette Valley Vineyards 2008 Dry Riesling ...,Riesling,Willamette Valley Vineyards,Willamette Valley,Riesling - Oregon,Riesling - Willamette Valley,Riesling - Willamette Valley
33851,49136,US,"Absolutely dry and tingly in acidity, this cit...",Reserve,84.0,8.0,California,Oak Grove 2009 Reserve Pinot Grigio (California),Pinot Grigio,Oak Grove,California,Pinot Grigio - California,Pinot Grigio - California,Pinot Grigio - California
25210,36191,US,"Light and crisp, it offers layers of lemon chi...",Foggy Oaks,88.0,20.0,California,Dry Creek Vineyard 2011 Foggy Oaks Chardonnay ...,Chardonnay,Dry Creek Vineyard,Russian River Valley,Chardonnay - California,Chardonnay - Russian River Valley,Chardonnay - Russian River Valley
32975,47814,US,"Like the label says, this is a bone-dry wine. ...",Dry,89.0,20.0,California,Robert Mondavi 2011 Dry Fumé Blanc (Napa Valley),Fumé Blanc,Robert Mondavi,Napa Valley,Fumé Blanc - California,Fumé Blanc - Napa Valley,Fumé Blanc - Napa Valley
