In [24]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

In [25]:
# Dataset column whose text is embedded and stored as vector metadata.
FIELD = 'summaries'
# Name of the Pinecone index that is created and queried by this notebook.
INDEX_NAME = 'arrowhead'

In [27]:
model = SentenceTransformer('all-MiniLM-L6-v2')
split = 'train'
text_field = FIELD
rec_num = 2000

# Never commit API keys to a notebook: the Cohere key is read from the
# CO_API_KEY environment variable instead of being hardcoded.
cohere_api_key = os.environ.get('CO_API_KEY')
if not cohere_api_key:
    raise RuntimeError(
        "Set the CO_API_KEY environment variable to your Cohere API key before running this cell."
    )
co = cohere.Client(api_key=cohere_api_key)

# Load the dataset (the split is controlled by `split` above)
dataset = load_dataset('pranjaljaiswal/arrowhead-bbc-news-summary', split=split)

# Embed the first `rec_num` rows of the dataset
embeddings = model.encode(dataset[text_field][:rec_num])
# (number of embedded rows, embedding dimension)
shape = embeddings.shape

In [28]:
# Convert the loaded dataset to a pandas DataFrame and preview its first five rows.
dataset_df = dataset.to_pandas()
dataset_df.head(5)

Unnamed: 0,articles,summaries
0,Summarize the following article: \nArgentina c...,Argentina is set to close its $102.6bn (Â£53.5...
1,Summarize the following article: \nRescue hope...,Shares in struggling German football club Boru...
2,Summarize the following article: \nSaudi NCCI'...,Shares in Saudi Arabia's National Company for ...
3,Summarize the following article: \nBrussels ra...,"Vodafone and O2, Britain's other big mobile ph..."
4,Summarize the following article: \nFresh hope ...,"Mr Valentine, who was born in the United State..."


In [54]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
):
    """
    Create a serverless Pinecone index if one with this name does not exist yet.

    The API key is read from the ``PINECONE_API_KEY`` environment variable so
    that no credential is stored in the notebook.

    Args:
        index_name: The name of the index
        dimension: The dimension of the vectors the index will hold
        metric: The similarity metric to use for the index
    Returns:
        Pinecone: A pinecone client object which can later be used for upserting vectors and connecting to VectorDBs
    Raises:
        RuntimeError: If ``PINECONE_API_KEY`` is not set.
    """
    api_key = os.environ.get("PINECONE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the PINECONE_API_KEY environment variable to your Pinecone API key."
        )

    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=api_key)
    existing_indexes = {index_info["name"] for index_info in pc.list_indexes()}
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
    print("Done!")
    return pc


def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = FIELD,
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index handle (the object returned by ``pc.Index(name)``)
        embeddings: The embeddings to upsert, one row per record
        dataset: The dataset containing the metadata; ``dataset[text_field]`` must
            hold at least as many texts as there are embedding rows, in the same order
        text_field: The dataset column stored as metadata next to each vector
        batch_size: The batch size to use for upserting
    Returns:
        An updated pinecone index
    Raises:
        ValueError: If ``batch_size`` is not positive, or the dataset has fewer
            texts than there are embeddings.
    """
    print("Upserting the embeddings to the Pinecone index...")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    vectors = np.asarray(embeddings)
    num_vectors = vectors.shape[0]

    # Only the rows that were actually embedded are needed; slicing avoids
    # building metadata for the rest of the dataset.
    texts = list(dataset[text_field][:num_vectors])
    if len(texts) != num_vectors:
        raise ValueError(
            f"dataset[{text_field!r}] has {len(texts)} rows but {num_vectors} embeddings were given"
        )

    for start in tqdm(range(0, num_vectors, batch_size)):
        stop = min(start + batch_size, num_vectors)
        # (id, vector, metadata) tuples for this batch only; vectors are sent
        # as plain lists of floats.
        batch = [
            (str(row), vectors[row].tolist(), {text_field: texts[row]})
            for row in range(start, stop)
        ]
        index.upsert(vectors=batch)
    return index


def cohere_response(query, cohere_client=co):
    """
    Send a query to the Cohere chat endpoint and return the reply text.

    Args:
        query: The text to send; a short "keep it brief" instruction is appended.
        cohere_client: The Cohere client used for the call (defaults to the
            notebook-level client).
    Returns:
        str: The ``text`` attribute of the chat response.
    """
    # Use the client that was passed in, not the module-level one, so callers
    # can actually supply their own client.
    response = cohere_client.chat(
        model='command-r-plus',
        message=f'{query} keep your answer consise.',
    )
    return response.text


def _default_embedding_model():
    """Load the default sentence-embedding model once and reuse it on later calls."""
    if not hasattr(_default_embedding_model, "_cached"):
        _default_embedding_model._cached = SentenceTransformer('all-MiniLM-L6-v2')
    return _default_embedding_model._cached


def augment_prompt(
        query: str,
        model: SentenceTransformer = None,
        index=None,
        top_k: int = 2,
) -> tuple:
    """
    Augment the prompt with the top results from the knowledge base
    Args:
        query: The query to augment
        model: The embedding model used to encode the query; when omitted, the
            'all-MiniLM-L6-v2' model is loaded on first use and then reused
        index: The vectorstore object (required)
        top_k: How many of the best matches to include as context
    Returns:
        tuple: ``(augmented_prompt, source_knowledge)`` where ``source_knowledge``
        is the matched texts joined by blank lines
    Raises:
        ValueError: If ``index`` is None or ``top_k`` is less than 1.
    """
    if index is None:
        raise ValueError("augment_prompt requires a vector index to query")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")
    if model is None:
        # Loading the model lazily avoids building a second model at function
        # definition time, which a default-argument instance would do.
        model = _default_embedding_model()

    query_vector = [float(value) for value in model.encode(query)]

    # Fetch only the matches that are used; the stored vectors themselves are
    # not needed, only the metadata text.
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata'][FIELD] for match in query_results[:top_k]]

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge


def compare_responses(query, index):
    """
    Print the plain LLM answer next to the retrieval-augmented answer for a query.

    Args:
        query: The question to ask.
        index: The vectorstore object used to retrieve context for the query.
    Returns:
        None. The query, both answers and the retrieved chunks are printed.
    """
    rag_prompt, retrieved_text = augment_prompt(query=query, index=index)

    # Ask once without context and once with the retrieved context.
    plain_answer = cohere_response(query)
    grounded_answer = cohere_response(rag_prompt)
    # The retrieved texts were joined with blank lines; recover the first two.
    sources = retrieved_text.split('\n\n')[:2]

    report = (
        f'Query: {query}',
        f'Original LLM rasponse: {plain_answer}',
        f'RAG response: {grounded_answer}',
        f'RAG source {sources}',
    )
    for line in report:
        print(line)
    

    

In [55]:
# Create the index if needed; its dimension is the second entry of the embeddings' shape.
pc = create_pinecone_index(INDEX_NAME, shape[1])

# Upsert the embeddings to the Pinecone index
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index, embeddings, dataset)

Creating a Pinecone index...
Done!
Upserting the embeddings to the Pinecone index...


  0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:05<00:00,  2.86it/s]


In [56]:
# Questions to run through both the plain LLM and the retrieval-augmented prompt.
queries = [
    'how many nominations for Golden Globe awards US actor Jamie Foxx has been given?',
    'what is the donation of the US bank to victims of former Chilean military ruler Augusto Pinochets?',
    'What is the value of the previously frozen assets Libya has pulled from the US?'
]

# Print the comparison for each query, followed by a separator line.
for query in queries:
    compare_responses(query, index)
    print('------------------------------------------------------------------------')

Query: how many nominations for Golden Globe awards US actor Jamie Foxx has been given?
Original LLM rasponse: Jamie Foxx has received five Golden Globe Award nominations.
RAG response: Two.
RAG source ["US actor Jamie Foxx has been given two nominations for Golden Globe awards, with Meryl Streep, Morgan Freeman and Cate Blanchett also up for prizes.The Golden Globes, Hollywood's second most prominent awards, are the first major nominations to be announced.The Golden Globes ceremony will take place on 16 January, with the Oscars following on 27 February.Clive Owen, David Carradine and Natalie Portman are also up for awards.", "Oscar nominees Swank and Foxx were among the winners at the Screen Actors Guild awards at the weekend, one of the many ceremonies held in the run-up to the Oscars.Many of those nominated for Oscars including DiCaprio, Foxx and Staunton - an Oscar nominee for her performance in Vera Drake - have also been nominated for Baftas.Leonardo DiCaprio, Jamie Foxx and Hila