In [None]:
import os
from dotenv import load_dotenv
import csv
import pandas as pd
from pinecone import Pinecone, ServerlessSpec
from pinecone.exceptions import PineconeApiException 
from langchain.embeddings.openai import OpenAIEmbeddings


In [None]:
## Get env variables
# The .env file is expected one directory above the current working directory.
current_directory = os.getcwd()
env_path = os.path.join(current_directory, '..', '.env')
# Load the variables defined in that file, then read both API keys.
# Each key is None when its variable is not set.
load_dotenv(dotenv_path=env_path)
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

In [None]:
def csv_to_dict(csv_file_path):
    """
    Read a CSV file into a nested dictionary keyed by its first column.

    The header row supplies the field names. For every data row, the value in
    the first column becomes the outer key and the remaining columns become an
    inner ``{column_name: value}`` dictionary. When the same key appears on
    several rows, the last row wins.

    Args:
        csv_file_path (str): The path to the CSV file.

    Returns:
        dict: Mapping of first-column value to a dictionary of the other columns.
    """
    with open(csv_file_path, mode='r', newline='') as handle:
        rows = csv.DictReader(handle)
        mapping = {}

        for record in rows:
            header = rows.fieldnames
            key_column = header[0]
            # Everything after the first column is stored as the row's payload
            mapping[record[key_column]] = {
                column: record[column] for column in header[1:]
            }

    return mapping

def dict_to_csv(data_dict, csv_file_path):
    """
    Write a dictionary of records to a CSV file by way of a pandas DataFrame.

    The dictionary is loaded with ``orient='index'``, so each top-level key
    becomes one row (stored as the DataFrame index) and the keys of the inner
    dictionaries become the columns. A confirmation line is printed afterwards.

    Args:
        data_dict (dict): The dictionary to be written out.
        csv_file_path (str): The path where the CSV file will be saved.

    Returns:
        None
    """
    # Build the frame and write it out in one step; the index is written too
    pd.DataFrame.from_dict(data_dict, orient='index').to_csv(csv_file_path)

    print(f"Data has been written to {csv_file_path}")

def select_pinecone_index(pc, index_name):
    """
    Return a handle to the named Pinecone index, creating the index if needed.

    A new index is created as a serverless index on AWS ``us-east-1`` with
    1536 dimensions and the cosine metric.

    Args:
        pc (object): The initialized Pinecone client.
        index_name (str): The name of the index to open (and create if missing).

    Returns:
        object: The handle returned by ``pc.Index(index_name)``.

    Raises:
        PineconeApiException: If creating the index fails for any reason other
            than the index already existing.
    """
    # The listing returned by the client holds index descriptions, not plain
    # strings, so the name must be compared against .names(). Fall back to the
    # listing itself for clients that return a plain list of names.
    listing = pc.list_indexes()
    names_fn = getattr(listing, "names", None)
    existing_names = list(names_fn()) if callable(names_fn) else list(listing)

    if index_name in existing_names:
        print(f"Index '{index_name}' already exists.")
    else:
        try:
            pc.create_index(
                name=index_name,
                dimension=1536,  # 1536 dim of text-embedding-ada-002
                metric="cosine",  # Replace with your model metric
                spec=ServerlessSpec(
                    cloud="aws",
                    region="us-east-1"
                )
            )
        except PineconeApiException as e:
            # The index may have been created between the listing and this call
            if "ALREADY_EXISTS" in str(e):
                print(f"Index '{index_name}' already exists.")
            else:
                raise

    return pc.Index(index_name)

def upsert_embeddings_to_pinecone(index, data_dict, batch_size=100):
    """
    Embed every key of a dictionary with OpenAI and upsert the vectors to Pinecone.

    All keys are embedded first, using the 'text-embedding-ada-002' model and
    the module-level OPENAI_API_KEY; the resulting vectors are then upserted
    in consecutive batches. Each vector is stored under the item's 'id' with
    the key text kept as 'name' metadata.

    Args:
        index (object): The Pinecone index to upsert vectors to.
        data_dict (dict): The input dictionary containing data.
                          Each key is the text to embed,
                          and each value must contain an 'id' field.
        batch_size (int): The number of vectors to upsert in each batch. Default is 100.
    """
    embedder = OpenAIEmbeddings(
        model='text-embedding-ada-002',
        openai_api_key=OPENAI_API_KEY
    )

    # One (id, vector, metadata) tuple per key, in dictionary order
    records = []
    for text in data_dict:
        embedding = embedder.embed_query(text)
        records.append((data_dict[text]['id'], embedding, {'name': text}))

    # Send the records to the index one slice at a time
    for start in range(0, len(records), batch_size):
        index.upsert(vectors=records[start:start + batch_size])

def find_closest_matches(source_index_name, target_index_name, data_dict, pinecone_client, top_k=1):
    """
    Find, for each item, the closest vector in the target index.

    For every entry of ``data_dict`` the stored vector is fetched from the
    source index by the entry's 'id' and used to query the target index. The
    first returned match is recorded on the entry as 'matched_vendor' (the
    match's 'name' metadata) and 'score'. Entries that have no 'id', whose
    vector is not found in the source index, or for which the query returns
    no matches get ``None`` for both fields instead of aborting the run.

    Args:
        source_index_name (str): The name of the Pinecone index containing the source embeddings.
        target_index_name (str): The name of the Pinecone index to search for matching vectors.
        data_dict (dict): A dictionary containing data with an 'id' field for each item.
                          It is updated in place.
        pinecone_client (object): The initialized Pinecone client object.
        top_k (int): The number of matches to request per query. Only the first
                     returned match is recorded. Default is 1.

    Returns:
        dict: The same ``data_dict`` object, with 'matched_vendor' and 'score' set on every entry.
    """
    # Initialize indexes once outside the loop
    source_index = pinecone_client.Index(source_index_name)
    target_index = pinecone_client.Index(target_index_name)

    for key, values in data_dict.items():
        name = None
        score = None

        # Look up the stored vector for this item; it may be absent
        item_id = values.get('id')
        vector_data = None
        if item_id is not None:
            fetched_vectors = source_index.fetch([item_id])['vectors']
            if item_id in fetched_vectors:
                vector_data = fetched_vectors[item_id]

        if vector_data is not None:
            # Query the target index for the closest match
            matching_result = target_index.query(
                vector=vector_data['values'], top_k=top_k, include_metadata=True
            )
            matches = matching_result['matches']
            if matches:
                # Only the first returned match is recorded
                name = matches[0]['metadata']['name']
                score = matches[0]['score']

        # Record the outcome on every entry so all rows share the same fields
        data_dict[key]['matched_vendor'] = name
        data_dict[key]['score'] = score

        if name is None:
            print(f"No match found for '{key}'")
        else:
            print(f"Closest match: {name} with a score of {score}")

    return data_dict


In [None]:
# Load csv files into dictionaries
# Each dictionary is keyed by the first column of its CSV file; the remaining
# columns are stored per key. Both paths are relative to the working directory.
csv_file_path = 'vendor_list_1.csv'
vendors_dict_source = csv_to_dict(csv_file_path)
# csv_file_path is reused here, so afterwards it refers to the second file
csv_file_path = 'vendor_list_2.csv'
vendors_dict_target = csv_to_dict(csv_file_path)

In [None]:
# Initialize Pinecone client
# PINECONE_API_KEY was read from the environment in the configuration cell
pc = Pinecone(api_key=PINECONE_API_KEY)

In [None]:
# Embed and upsert the source and target dictionaries to Pinecone
# Each vendor list goes into its own index; select_pinecone_index creates the
# index when needed and returns a handle to it.
index_name_source = "vendors-source"
index = select_pinecone_index(pc, index_name_source)
upsert_embeddings_to_pinecone(index, vendors_dict_source, batch_size=100)
# `index` is rebound below, so after this cell it refers to the target index
index_name_target = "vendors-target"
index = select_pinecone_index(pc, index_name_target)
upsert_embeddings_to_pinecone(index, vendors_dict_target, batch_size=100)


In [None]:
# Find the closest matches between the source and target vector databases
# find_closest_matches updates vendors_dict_source in place and returns it,
# so matches_dict is the same object as vendors_dict_source.
matches_dict = find_closest_matches(index_name_source, index_name_target, vendors_dict_source, pc)


In [None]:
# Save the matched vendors to a CSV file
# One row per source vendor; the path is relative to the working directory
dict_to_csv(matches_dict, 'matched_companies.csv')