In [10]:
pip install pinecone cohere



In [11]:
import hashlib
import os

import cohere
import pandas as pd
from pinecone import Pinecone, ServerlessSpec

In [12]:
def initialize_cohere(api_key):
    """
    Build a Cohere client authenticated with the given key.

    Args:
    api_key (str): Your Cohere API key.

    Returns:
    cohere.Client: A client constructed from `api_key`.
    """
    client = cohere.Client(api_key)
    return client

In [13]:
def initialize_pinecone(api_key, environment, index_name, dimension, region="us-east-1"):
    """
    Connect to Pinecone and return a handle to a serverless index, creating
    the index first if it doesn't exist.

    Args:
    api_key (str): Your Pinecone API key.
    environment (str): Cloud provider for the serverless index (e.g. "aws").
        Despite its name, this value is passed to ServerlessSpec as `cloud`,
        not as a region.
    index_name (str): Name of the Pinecone index.
    dimension (int): Dimension of the embeddings.
    region (str): Cloud region used when a new index has to be created.
        Defaults to "us-east-1". Not used when the index already exists.

    Returns:
    pinecone.Index: The initialized Pinecone index.
    """
    pc = Pinecone(api_key=api_key)

    # Only create the index when no existing index carries this name.
    existing_names = {idx.name for idx in pc.list_indexes()}
    if index_name not in existing_names:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric='cosine',
            spec=ServerlessSpec(cloud=environment, region=region),
        )
    return pc.Index(index_name)

In [14]:
# Generate embedding for text using Cohere
def generate_embedding(cohere_client, text, model="embed-english-v3.0", input_type="search_query"):
    """
    Generate an embedding for the given text using Cohere.

    Args:
    cohere_client (cohere.Client): The Cohere client.
    text (str): The input text to embed.
    model (str): The embedding model to use.
    input_type (str): Value forwarded to Cohere's `input_type` argument.
        Defaults to "search_query", the value this function always used
        before the parameter existed. Cohere also documents
        "search_document" for texts stored in a vector database.

    Returns:
    list: The embedding vector.
    """
    # The API takes a batch of texts; send a batch of one and unwrap the single result.
    response = cohere_client.embed(texts=[text], input_type=input_type, model=model)
    return response.embeddings[0]

In [15]:
def generate_id_from_question(question):
    """
    Derive a deterministic ID for a question from its SHA-256 hash.

    Args:
    question (str): The input question.

    Returns:
    str: Hex-encoded SHA-256 digest of the UTF-8 encoded question.
    """
    hasher = hashlib.sha256()
    hasher.update(question.encode('utf-8'))
    return hasher.hexdigest()

In [41]:
# Store embeddings in the Pinecone vector database
def store_embeddings(index, dataset, cohere_client, batch_size=100):
    """
    Embed each distinct question and upsert it, with metadata, into Pinecone.

    Rows are grouped by 'question'. Within a group only rows whose
    'generated_answer' equals 'correct_answer' are considered, and the one
    with the smallest 'priority' supplies the stored strategy. Questions
    with no correct row are skipped.

    Args:
    index (pinecone.Index): The Pinecone index.
    dataset (pandas.DataFrame): Rows with 'question', 'generated_answer',
        'correct_answer', 'strategy' and 'priority' columns.
    cohere_client (cohere.Client): The Cohere client.
    batch_size (int): Number of vectors accumulated before each upsert call.
    """
    pending = []
    for question, group in dataset.groupby('question'):
        correct_answers = group[group['generated_answer'] == group['correct_answer']]
        if correct_answers.empty:
            continue  # Skip if no correct answers

        # Stable sort so that rows tied on priority keep their original order,
        # making the selected strategy deterministic.
        best_row = correct_answers.sort_values('priority', kind='stable').iloc[0]

        metadata = {
            "question": question,  # Store the question as part of the metadata
            "strategy": best_row['strategy'],
            "priority": int(best_row['priority'])
        }
        embedding = generate_embedding(cohere_client, question)
        vector_id = generate_id_from_question(question)
        pending.append((vector_id, embedding, metadata))

        # Flush a full batch and start a new one.
        if len(pending) >= batch_size:
            index.upsert(pending)
            pending = []

    # Flush whatever is left after the last full batch.
    if pending:
        index.upsert(pending)

In [42]:
def retrieve_top_k_questions(index, cohere_client, new_question, top_k=5):
    """
    Retrieve the most similar stored questions for a new question, keeping
    one result per strategy.

    Args:
    index (pinecone.Index): The Pinecone index.
    cohere_client (cohere.Client): The Cohere client.
    new_question (str): The input question.
    top_k (int): Number of nearest matches to request from the index.

    Returns:
    list of dict: One entry per distinct strategy among the matches, with
    'question', 'strategy', 'priority' and 'score' keys, sorted by score in
    descending order. May contain fewer than `top_k` entries because matches
    sharing a strategy are collapsed into one.
    """
    # Generate embedding for the new question
    query_embedding = generate_embedding(cohere_client, new_question)

    # Query the vector database, honouring the caller's top_k
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

    # Keep only the first match seen for each strategy so the returned
    # strategies are distinct.
    grouped_results = {}
    for match in results['matches']:
        metadata = match['metadata']
        strategy = metadata['strategy']
        if strategy in grouped_results:
            continue
        grouped_results[strategy] = {
            "question": metadata.get('question', 'Unknown Question'),
            "strategy": strategy,
            "priority": metadata['priority'],
            "score": match['score']
        }

    # Sort grouped results by score
    return sorted(grouped_results.values(), key=lambda x: x['score'], reverse=True)

In [44]:
def main():
    """
    Run the end-to-end demo: index the dataset, then retrieve strategies
    for a sample question and print them.

    API keys are read from the COHERE_API_KEY and PINECONE_API_KEY
    environment variables.

    Raises:
    RuntimeError: If either environment variable is missing or empty.
    """
    # Credentials come from the environment so they are never saved with the notebook.
    # NOTE(review): the keys that used to be hard-coded here have been exposed
    # and should be revoked and reissued.
    cohere_api_key = os.environ.get("COHERE_API_KEY")
    pinecone_api_key = os.environ.get("PINECONE_API_KEY")
    missing = [
        name
        for name, value in (
            ("COHERE_API_KEY", cohere_api_key),
            ("PINECONE_API_KEY", pinecone_api_key),
        )
        if not value
    ]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    pinecone_env = "aws"
    index_name = "ap-retrieval"
    embedding_dim = 1024

    # Initialize clients
    cohere_client = initialize_cohere(cohere_api_key)
    index = initialize_pinecone(pinecone_api_key, pinecone_env, index_name, embedding_dim)

    csv_file_path = "/content/gsm8k_generated_dataset.csv"  # Replace with your file path
    dataset = pd.read_csv(csv_file_path)

    # Store embeddings
    store_embeddings(index, dataset, cohere_client, batch_size=100)

    # Retrieve strategies for a new question
    new_question = "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"
    results = retrieve_top_k_questions(index, cohere_client, new_question)

    # Print the results
    print("Retrieved Questions:")
    for strategy in results:
        print(f"Question: {strategy['question']}, "
              f"Strategy: {strategy['strategy']}, "
              f"Priority: {strategy['priority']}, "
              f"Score: {strategy['score']}")



# Entry point: call main() only when __name__ is "__main__", i.e. when this
# code is executed directly rather than imported as a module.
if __name__ == "__main__":
    main()


Retrieved Questions:
Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?, Strategy: few-shot, Priority: 1.0, Score: 1.00000012
Question: Lloyd has an egg farm. His chickens produce 252 eggs per day and he sells them for $2 per dozen. How much does Lloyd make on eggs per week?, Strategy: zero-shot, Priority: 0.0, Score: 0.635616422
Question: At the end of the school year, Kate asked her teacher if she could have the 3 boxes of 64 crayons since they were all worn down to small pieces.  The teacher agreed.  Kate took the crayons home and decided to make crayon muffins by melting 8 small pieces of crayons together in a muffin tin that she placed in an oven for a few minutes. If she can sell her muffin crayons for $1.50 each, how much money can Kate make?,